## **Chapter One**
### Simple Expressions
2 + 4 # Addition
## [1] 6
4 / 2 # Division
## [1] 2
3 * 4 # Multiplication
## [1] 12
45 - 13 # Subtraction
## [1] 32
4L / 5L # Regular division (always returns a double); integer division is %/%
## [1] 0.8
4 %% 3 # Modulo: the remainder after integer division
## [1] 1
2 ^ 5 # Exponentiation
## [1] 32
3 ** 4 # Exponentiation (** is a legacy synonym for ^)
## [1] 81
9 ** 3 # Exponentiation
## [1] 729
s = "Hello Charles, I hope you are working smart to achieve your dream?"
s
## [1] "Hello Charles, I hope you are working smart to achieve your dream?"
e <- 34 # Assign the value 34 to the variable e
e
## [1] 34
nchar(e) # number of characters (34 prints as "34", so 2 characters)
## [1] 2
3 -> r # Right assignment: assign the value 3 to the variable r
r
## [1] 3
(u <- "Nana Kwame") # parentheses form will print an output
## [1] "Nana Kwame"
nchar(u) # number of characters (the space counts, so 10)
## [1] 10
q = seq( 1,50,2) # Start from 1 and end at 50 with a step of 2
q
## [1] 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47 49
q[23] # checking for index of the sequence
## [1] 45
q[20] # checking for index of the sequence
## [1] 39
q[2:9] # checking form index 2 to index 9
## [1] 3 5 7 9 11 13 15 17
q ** 2 - r
## [1] -2 6 22 46 78 118 166 222 286 358 438 526 622 726 838
## [16] 958 1086 1222 1366 1518 1678 1846 2022 2206 2398
v <- c("A" = 1, "B" = 2, "C" = 3) # Creating keys and values
v
## A B C
## 1 2 3
names(v) <- c("x", "y", "z") # Assigning new keys to the values
v
## x y z
## 1 2 3
v["x"] # checking the key x
## x
## 1
v["y"] # checking the key y
## y
## 2
(x <- 1:4) # parentheses form will print an output
## [1] 1 2 3 4
(y <- 4:7) # parentheses form will print an output
## [1] 4 5 6 7
x - y
## [1] -3 -3 -3 -3
#?nchar
#?`+`
# Square each element of a numeric vector (vectorized).
# x: numeric vector; returns x squared, same length as x.
square <- function(x) x^2 # ^ is the idiomatic operator; ** is a legacy synonym
square(1:4)
## [1] 1 4 9 16
# Cube each element of a numeric vector (vectorized).
# Use <- for assignment and ^ for exponentiation (idiomatic R).
cube <- function(x) x^3
cube(1:9)
## [1] 1 8 27 64 125 216 343 512 729
# Square x element-wise, then subtract y (usual recycling rules apply).
# The explicit return() is unnecessary: R returns the last expression.
square_and_subtract <- function(x, y) x^2 - y
square_and_subtract(30,30)
## [1] 870
sum(1:4)
## [1] 10
1 : 4
## [1] 1 2 3 4
sqrt(1:4)
## [1] 1.000000 1.414214 1.732051 2.000000
# Arithmetic mean of a numeric vector: the total divided by the
# number of elements. (An empty vector yields 0 / 0 = NaN.)
average <- function(x) sum(x) / length(x)
average(1:9)
## [1] 5
mean(1:9)
## [1] 5
# NOTE(review): this definition shadows base::mean() for the rest of the
# session; consider a distinct name such as my_mean(). Existing behavior is
# kept; an optional, backward-compatible na.rm argument is added so missing
# values can be excluded from both the sum and the count.
mean <- function(x, na.rm = FALSE) {
  if (na.rm) x <- x[!is.na(x)]
  sum(x) / length(x)
}
mean(2:24)
## [1] 13
### **Control Flow and Loops**
if (3 > 2) "true"
## [1] "true"
if (2 < 3) "false" else "true"
## [1] "false"
if (2 > 3) "bar" else "baz"
## [1] "baz"
if (5 > 4) "Charles" else "Nana"
## [1] "Charles"
if (2 > 3) {
x <- "bar"
}else "baz"
## [1] "baz"
x <- 1:5
ifelse(x > 3, "bar", "baz")
## [1] "baz" "baz" "baz" "bar" "bar"
# Square the even elements of x; leave the odd elements unchanged.
# Vectorized via ifelse(); ^ replaces the legacy ** operator.
maybe_square <- function(x) {
  ifelse(x %% 2 == 0, x^2, x)
}
maybe_square(1:9)
## [1] 1 4 3 16 5 36 7 64 9
xo <- 1:5 # Assigning values (1 to 5) to variable xo
total <- 0
for (element in xo) total <- total + element
total
## [1] 15
xo
## [1] 1 2 3 4 5
x <- 1:5
total <- 0
for (index in seq_along(x)) {
element <- x[index]
total <- total + element
}
total
## [1] 15
x = -1:-5 # Assigning values (-1 to - 5) to variable x
total <- 9
index <- 1
while (index <= length(x)) {
element <- x[index]
index <- index + 1
total <- total + element
}
total
## [1] -6
x
## [1] -1 -2 -3 -4 -5
summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5 -4 -3 -3 -2 -1
### Factors
f <- factor(c("small", "small", "medium",
"large", "small", "large"))
f
## [1] small small medium large small large
## Levels: large medium small
ordered(f)
## [1] small small medium large small large
## Levels: large < medium < small
summary(f)
## large medium small
## 2 1 3
ff <- factor(c("small", "small", "medium",
"large", "small", "large"),
levels = c("small", "medium", "large")) # changing the levels of ff
ff
## [1] small small medium large small large
## Levels: small medium large
ordered(ff)
## [1] small small medium large small large
## Levels: small < medium < large
summary(ff)
## small medium large
## 3 1 2
f <- factor(c("small", "small", "medium",
"large", "small", "large"),
levels = c("medium", "small", "large"),
ordered = TRUE)
f
## [1] small small medium large small large
## Levels: medium < small < large
summary(f)
## medium small large
## 1 3 2
v <- 1:26
names(v) <- LETTERS[1:26] # Extracting the name of the letters with numbers
v
## A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
(ffo <- factor(LETTERS[1:4]))
## [1] A B C D
## Levels: A B C D
v[ffo] # Factor subscripts use the factor's integer codes, not its labels
## A B C D
## 1 2 3 4
(ff <- factor(LETTERS[1:4], levels = rev(LETTERS[1:4])))
## [1] A B C D
## Levels: D C B A
v[ff]
## D C B A
## 4 3 2 1
as.numeric(ff) # Number form
## [1] 4 3 2 1
as.vector(ff) # String form
## [1] "A" "B" "C" "D"
v[as.vector(ff)]
## A B C D
## 1 2 3 4
### Data Frames
df <- data.frame(a = 1:4, b = letters[1:4]) # Creating a data frame
df
## a b
## 1 1 a
## 2 2 b
## 3 3 c
## 4 4 d
df[1,1] # row one , column one
## [1] 1
df[1,2] # row one , column two
## [1] "a"
df[2,1] # row two , column one
## [1] 2
df[2,2] # row two , column two
## [1] "b"
df[1,] # All row one values
## a b
## 1 1 a
df[,1] # All column one values
## [1] 1 2 3 4
df[,"a"] # using the column name
## [1] 1 2 3 4
df$b # using the column name
## [1] "a" "b" "c" "d"
df2 <- data.frame(a = 5:7, b = letters[5:7])
rbind(df, df2) ### Combining them using their rows
## a b
## 1 1 a
## 2 2 b
## 3 3 c
## 4 4 d
## 5 5 e
## 6 6 f
## 7 7 g
df3 <- data.frame(c = 5:8, d = letters[5:8])
cbind(df, df3) ### Combining them using their columns
## a b c d
## 1 1 a 5 e
## 2 2 b 6 f
## 3 3 c 7 g
## 4 4 d 8 h
df4 = data.frame(Random = rnorm(26), Alphabets = letters)
df4
## Random Alphabets
## 1 0.4516460 a
## 2 0.2163044 b
## 3 0.6436671 c
## 4 0.4485669 d
## 5 -1.8389144 e
## 6 -0.7360462 f
## 7 0.1452918 g
## 8 0.1854455 h
## 9 -0.0360053 i
## 10 0.1750531 j
## 11 1.0541459 k
## 12 -0.3875946 l
## 13 -0.8633137 m
## 14 -1.1431215 n
## 15 0.2610843 o
## 16 -0.7335032 p
## 17 -0.1958796 q
## 18 0.6539016 r
## 19 -1.4845533 s
## 20 -1.2848276 t
## 21 -0.2667454 u
## 22 -0.4209919 v
## 23 2.5683044 w
## 24 0.6854559 x
## 25 -0.4637876 y
## 26 -0.4577713 z
### Missing Values
NA + 5
## [1] NA
is.na(NA) # checking for a missing number
## [1] TRUE
is.na(4) # checking if 4 is a missing number
## [1] FALSE
v <- c(1:100,NA)
sum(v)
## [1] NA
sum(v, na.rm = TRUE)
## [1] 5050
### **R Packages**
#install.packages("magrittr")
library(magrittr)
### Data Pipeline
# Return n randomly sampled rows of data frame d (without replacement).
# drop = FALSE keeps the result a data frame even when d has one column,
# which would otherwise collapse to a plain vector.
subsample_rows <- function(d, n) {
  rows <- sample(nrow(d), n)
  d[rows, , drop = FALSE]
}
d <- data.frame(x = rnorm(100), y = rnorm(100))
d %>% subsample_rows(n = 3)
## x y
## 56 0.92539129 0.3910950
## 65 0.09901638 0.3875896
## 96 -0.50833536 0.9330080
do <- data.frame(x = rnorm(100), y = rnorm(100))
do %>% lm(y ~ x, data = .)
##
## Call:
## lm(formula = y ~ x, data = .)
##
## Coefficients:
## (Intercept) x
## -0.25883 0.08946
so <- data.frame(x = 1:10, y = 11:20)
so %>% lm(y ~ x, data = .)
##
## Call:
## lm(formula = y ~ x, data = .)
##
## Coefficients:
## (Intercept) x
## 10 1
#?lm()
rnorm(4) %>% data.frame(x = ., is_negative = . < 0)
## x is_negative
## 1 0.4693503 FALSE
## 2 -0.6277748 TRUE
## 3 -0.6851523 TRUE
## 4 -0.4342983 TRUE
rnorm(4) %>% data.frame(x = ., y = abs(.))
## x y
## 1 0.9116599 0.9116599
## 2 1.1205732 1.1205732
## 3 -1.1213551 1.1213551
## 4 1.9892887 1.9892887
rnorm(4) %>% data.frame(x = sin(.), y = cos(.))
## . x y
## 1 -0.9570749 -0.8175104 0.575913785
## 2 -0.8458195 -0.7485148 0.663118106
## 3 0.2112488 0.2096811 0.977769828
## 4 -1.5766002 -0.9999832 -0.005803872
#sin(1.03130453)
#cos(1.03130453)
# Combine x and y through nested trigonometric sums:
# first collapse cos(x) and sin(y) into one total, then return the
# total of sin(x) and the cosine of that intermediate value.
# sum() is kept (rather than +) so vector inputs collapse to a scalar,
# exactly as in the original.
f <- function(x, y) {
  combined <- sum(cos(x), sin(y))
  sum(sin(x), cos(combined))
}
f(2,4)
## [1] 1.296732
# Scatter-plot y against x from data frame d, then overlay the
# least-squares regression line fitted to the same data.
plot_and_fit <- function(d) {
  fit <- lm(y ~ x, data = d)
  plot(y ~ x, data = d)
  abline(fit)
}
x <- rnorm(20)
y <- x + rnorm(20)
data.frame(x, y) %>% plot_and_fit
data.frame(x, y) %>% (function(d) {
plot(y ~ x, data = d)
abline(lm(y ~ x, data = d))
})
data.frame(x, y) %>% {
plot(y ~ x, data = .)
abline(lm(y ~ x, data = .))
}
d <- data.frame(x = rnorm(100), y = 4 + rnorm(100))
#d %>% {data.frame(mean_x = mean(.$x), mean_y = mean(.$y))}
d %$% data.frame(mean_x = mean(x), mean_y = mean(y))
## mean_x mean_y
## 1 -0.08483521 4.043954
d <- data.frame(x = rnorm(10), y = rnorm(10))
d %T>% plot(y ~ x, data = .) %>% lm(y ~ x, data = .)
##
## Call:
## lm(formula = y ~ x, data = .)
##
## Coefficients:
## (Intercept) x
## 0.1483 0.1683
## **Chapter Three**
### Data Manipulation
library(datasets)
library(ggplot2)
#library(help = "datasets")
data(cars)
head(cars)
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
cars %>% head(3) ### First 3 rows of the cars datasets
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
cars %>% tail(3) ### Last 3 rows of the cars datasets
## speed dist
## 48 24 93
## 49 24 120
## 50 25 85
cars %>% summary ### Summary of the cars datasets
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
cars %>% qplot(speed, dist, data = .)
#install.packages("mlbench")
library(mlbench)
#library(help = "mlbench")
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
iris %>% head(5)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
iris %>% summary ### Summary of the iris datasets
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
iris %>% qplot(Sepal.Length, Sepal.Width, data = ., color = Species)
iris %>% qplot(Petal.Length, Petal.Width, data = ., color = Species)
#?read.table
#### **Breast Cancer**
data(BreastCancer)
BreastCancer %>% head()
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025 5 1 1 1 2
## 2 1002945 5 4 4 5 7
## 3 1015425 3 1 1 1 2
## 4 1016277 6 8 8 1 3
## 5 1017023 4 1 1 3 2
## 6 1017122 8 10 10 8 7
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## 1 1 3 1 1 benign
## 2 10 3 2 1 benign
## 3 2 3 1 1 benign
## 4 4 3 7 1 benign
## 5 1 3 1 1 benign
## 6 10 9 7 1 malignant
data_url = "http://tinyurl.com/kw4xtts"
lines <- readLines(data_url)
lines[1:5]
## [1] "1000025,5,1,1,1,2,1,3,1,1,2" "1002945,5,4,4,5,7,10,3,2,1,2"
## [3] "1015425,3,1,1,1,2,2,3,1,1,2" "1016277,6,8,8,1,3,4,3,7,1,2"
## [5] "1017023,4,1,1,3,2,1,3,1,1,2"
raw_breast_cancer = read.csv("C:/Users/HP/Desktop/Data Science/Machine learning/breast+cancer+wisconsin+original/breast-cancer-wisconsin.data")
raw_breast_cancer %>% head(6)
## X1000025 X5 X1 X1.1 X1.2 X2 X1.3 X3 X1.4 X1.5 X2.1
## 1 1002945 5 4 4 5 7 10 3 2 1 2
## 2 1015425 3 1 1 1 2 2 3 1 1 2
## 3 1016277 6 8 8 1 3 4 3 7 1 2
## 4 1017023 4 1 1 3 2 1 3 1 1 2
## 5 1017122 8 10 10 8 7 10 9 7 1 4
## 6 1018099 1 1 1 1 2 10 3 1 1 2
raw_breast_cancer <- read.csv(data_url)
raw_breast_cancer %>% head()
## X1000025 X5 X1 X1.1 X1.2 X2 X1.3 X3 X1.4 X1.5 X2.1
## 1 1002945 5 4 4 5 7 10 3 2 1 2
## 2 1015425 3 1 1 1 2 2 3 1 1 2
## 3 1016277 6 8 8 1 3 4 3 7 1 2
## 4 1017023 4 1 1 3 2 1 3 1 1 2
## 5 1017122 8 10 10 8 7 10 9 7 1 4
## 6 1018099 1 1 1 1 2 10 3 1 1 2
raw_breast_cancer <- read.csv(data_url, header = FALSE)
raw_breast_cancer %>% head()
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
## 1 1000025 5 1 1 1 2 1 3 1 1 2
## 2 1002945 5 4 4 5 7 10 3 2 1 2
## 3 1015425 3 1 1 1 2 2 3 1 1 2
## 4 1016277 6 8 8 1 3 4 3 7 1 2
## 5 1017023 4 1 1 3 2 1 3 1 1 2
## 6 1017122 8 10 10 8 7 10 9 7 1 4
names(raw_breast_cancer) <- names(BreastCancer)
raw_breast_cancer %>% head()
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025 5 1 1 1 2
## 2 1002945 5 4 4 5 7
## 3 1015425 3 1 1 1 2
## 4 1016277 6 8 8 1 3
## 5 1017023 4 1 1 3 2
## 6 1017122 8 10 10 8 7
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## 1 1 3 1 1 2
## 2 10 3 2 1 2
## 3 2 3 1 1 2
## 4 4 3 7 1 2
## 5 1 3 1 1 2
## 6 10 9 7 1 4
raw_breast_cancer <- read.csv(data_url, header = FALSE,
col.names = names(BreastCancer))
raw_breast_cancer %>% head()
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025 5 1 1 1 2
## 2 1002945 5 4 4 5 7
## 3 1015425 3 1 1 1 2
## 4 1016277 6 8 8 1 3
## 5 1017023 4 1 1 3 2
## 6 1017122 8 10 10 8 7
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## 1 1 3 1 1 2
## 2 10 3 2 1 2
## 3 2 3 1 1 2
## 4 4 3 7 1 2
## 5 1 3 1 1 2
## 6 10 9 7 1 4
formatted_breast_cancer <- raw_breast_cancer
#mapped <- formatted_breast_cancer$Class %>% map_class
#mapped %>% table
# Map the numeric Class codes to labels: 2 = benign, anything else = malignant.
# (Typo fix: "bening" -> "benign"; the dict version below already spells it
# correctly.)
map_class <- function(x) {
  ifelse(x == 2, "benign", "malignant")
}
mapped <- formatted_breast_cancer$Class %>% map_class
mapped %>% table
## .
##    benign malignant
##       458       241
formatted_breast_cancer$Class %>% unique
## [1] 2 4
#dict <- c("2" = "benign", "4" = "malignant")
#map_class <- function(x) dict[as.character(x)]
#mapped <- formatted_breast_cancer$Class %>% map_class
#mapped %>% table
mapped[1:20]
##  [1] "benign"    "benign"    "benign"    "benign"    "benign"    "malignant"
##  [7] "benign"    "benign"    "benign"    "benign"    "benign"    "benign"
## [13] "malignant" "benign"    "malignant" "malignant" "benign"    "benign"
## [19] "malignant" "benign"
#mapped %<>% unname
#mapped[1:20]
# NOTE(review): this statement mixes three assignment styles. Because ->
# has higher precedence than =, the data frame returned by read.csv() is
# routed through the right-assignments first, so all three names (read,
# raw_breast_cancer, formatted_breast_cancer) end up bound to the same
# data. One `<-` per name would be clearer; also avoid the name `read`,
# which is easy to confuse with a function.
read = read.csv(data_url, header = FALSE,
col.names = names(BreastCancer)) ->
raw_breast_cancer ->
formatted_breast_cancer
# Recode Class from the 2/4 codes to a labeled factor via a lookup vector,
# stripping the names the lookup attaches before converting to factor.
dict <- c("2" = "benign", "4" = "malignant")
map_class <- function(x) dict[as.character(x)]
formatted_breast_cancer$Class <-
formatted_breast_cancer$Class %>%
map_class %>%
unname %>%
factor(levels = c("benign", "malignant"))
read
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025 5 1 1 1 2
## 2 1002945 5 4 4 5 7
## 3 1015425 3 1 1 1 2
## 4 1016277 6 8 8 1 3
## 5 1017023 4 1 1 3 2
## 6 1017122 8 10 10 8 7
## 7 1018099 1 1 1 1 2
## 8 1018561 2 1 2 1 2
## 9 1033078 2 1 1 1 2
## 10 1033078 4 2 1 1 2
## 11 1035283 1 1 1 1 1
## 12 1036172 2 1 1 1 2
## 13 1041801 5 3 3 3 2
## 14 1043999 1 1 1 1 2
## 15 1044572 8 7 5 10 7
## 16 1047630 7 4 6 4 6
## 17 1048672 4 1 1 1 2
## 18 1049815 4 1 1 1 2
## 19 1050670 10 7 7 6 4
## 20 1050718 6 1 1 1 2
## 21 1054590 7 3 2 10 5
## 22 1054593 10 5 5 3 6
## 23 1056784 3 1 1 1 2
## 24 1057013 8 4 5 1 2
## 25 1059552 1 1 1 1 2
## 26 1065726 5 2 3 4 2
## 27 1066373 3 2 1 1 1
## 28 1066979 5 1 1 1 2
## 29 1067444 2 1 1 1 2
## 30 1070935 1 1 3 1 2
## 31 1070935 3 1 1 1 1
## 32 1071760 2 1 1 1 2
## 33 1072179 10 7 7 3 8
## 34 1074610 2 1 1 2 2
## 35 1075123 3 1 2 1 2
## 36 1079304 2 1 1 1 2
## 37 1080185 10 10 10 8 6
## 38 1081791 6 2 1 1 1
## 39 1084584 5 4 4 9 2
## 40 1091262 2 5 3 3 6
## 41 1096800 6 6 6 9 6
## 42 1099510 10 4 3 1 3
## 43 1100524 6 10 10 2 8
## 44 1102573 5 6 5 6 10
## 45 1103608 10 10 10 4 8
## 46 1103722 1 1 1 1 2
## 47 1105257 3 7 7 4 4
## 48 1105524 1 1 1 1 2
## 49 1106095 4 1 1 3 2
## 50 1106829 7 8 7 2 4
## 51 1108370 9 5 8 1 2
## 52 1108449 5 3 3 4 2
## 53 1110102 10 3 6 2 3
## 54 1110503 5 5 5 8 10
## 55 1110524 10 5 5 6 8
## 56 1111249 10 6 6 3 4
## 57 1112209 8 10 10 1 3
## 58 1113038 8 2 4 1 5
## 59 1113483 5 2 3 1 6
## 60 1113906 9 5 5 2 2
## 61 1115282 5 3 5 5 3
## 62 1115293 1 1 1 1 2
## 63 1116116 9 10 10 1 10
## 64 1116132 6 3 4 1 5
## 65 1116192 1 1 1 1 2
## 66 1116998 10 4 2 1 3
## 67 1117152 4 1 1 1 2
## 68 1118039 5 3 4 1 8
## 69 1120559 8 3 8 3 4
## 70 1121732 1 1 1 1 2
## 71 1121919 5 1 3 1 2
## 72 1123061 6 10 2 8 10
## 73 1124651 1 3 3 2 2
## 74 1125035 9 4 5 10 6
## 75 1126417 10 6 4 1 3
## 76 1131294 1 1 2 1 2
## 77 1132347 1 1 4 1 2
## 78 1133041 5 3 1 2 2
## 79 1133136 3 1 1 1 2
## 80 1136142 2 1 1 1 3
## 81 1137156 2 2 2 1 1
## 82 1143978 4 1 1 2 2
## 83 1143978 5 2 1 1 2
## 84 1147044 3 1 1 1 2
## 85 1147699 3 5 7 8 8
## 86 1147748 5 10 6 1 10
## 87 1148278 3 3 6 4 5
## 88 1148873 3 6 6 6 5
## 89 1152331 4 1 1 1 2
## 90 1155546 2 1 1 2 3
## 91 1156272 1 1 1 1 2
## 92 1156948 3 1 1 2 2
## 93 1157734 4 1 1 1 2
## 94 1158247 1 1 1 1 2
## 95 1160476 2 1 1 1 2
## 96 1164066 1 1 1 1 2
## 97 1165297 2 1 1 2 2
## 98 1165790 5 1 1 1 2
## 99 1165926 9 6 9 2 10
## 100 1166630 7 5 6 10 5
## 101 1166654 10 3 5 1 10
## 102 1167439 2 3 4 4 2
## 103 1167471 4 1 2 1 2
## 104 1168359 8 2 3 1 6
## 105 1168736 10 10 10 10 10
## 106 1169049 7 3 4 4 3
## 107 1170419 10 10 10 8 2
## 108 1170420 1 6 8 10 8
## 109 1171710 1 1 1 1 2
## 110 1171710 6 5 4 4 3
## 111 1171795 1 3 1 2 2
## 112 1171845 8 6 4 3 5
## 113 1172152 10 3 3 10 2
## 114 1173216 10 10 10 3 10
## 115 1173235 3 3 2 1 2
## 116 1173347 1 1 1 1 2
## 117 1173347 8 3 3 1 2
## 118 1173509 4 5 5 10 4
## 119 1173514 1 1 1 1 4
## 120 1173681 3 2 1 1 2
## 121 1174057 1 1 2 2 2
## 122 1174057 4 2 1 1 2
## 123 1174131 10 10 10 2 10
## 124 1174428 5 3 5 1 8
## 125 1175937 5 4 6 7 9
## 126 1176406 1 1 1 1 2
## 127 1176881 7 5 3 7 4
## 128 1177027 3 1 1 1 2
## 129 1177399 8 3 5 4 5
## 130 1177512 1 1 1 1 10
## 131 1178580 5 1 3 1 2
## 132 1179818 2 1 1 1 2
## 133 1180194 5 10 8 10 8
## 134 1180523 3 1 1 1 2
## 135 1180831 3 1 1 1 3
## 136 1181356 5 1 1 1 2
## 137 1182404 4 1 1 1 2
## 138 1182410 3 1 1 1 2
## 139 1183240 4 1 2 1 2
## 140 1183246 1 1 1 1 1
## 141 1183516 3 1 1 1 2
## 142 1183911 2 1 1 1 2
## 143 1183983 9 5 5 4 4
## 144 1184184 1 1 1 1 2
## 145 1184241 2 1 1 1 2
## 146 1184840 1 1 3 1 2
## 147 1185609 3 4 5 2 6
## 148 1185610 1 1 1 1 3
## 149 1187457 3 1 1 3 8
## 150 1187805 8 8 7 4 10
## 151 1188472 1 1 1 1 1
## 152 1189266 7 2 4 1 6
## 153 1189286 10 10 8 6 4
## 154 1190394 4 1 1 1 2
## 155 1190485 1 1 1 1 2
## 156 1192325 5 5 5 6 3
## 157 1193091 1 2 2 1 2
## 158 1193210 2 1 1 1 2
## 159 1193683 1 1 2 1 3
## 160 1196295 9 9 10 3 6
## 161 1196915 10 7 7 4 5
## 162 1197080 4 1 1 1 2
## 163 1197270 3 1 1 1 2
## 164 1197440 1 1 1 2 1
## 165 1197510 5 1 1 1 2
## 166 1197979 4 1 1 1 2
## 167 1197993 5 6 7 8 8
## 168 1198128 10 8 10 10 6
## 169 1198641 3 1 1 1 2
## 170 1199219 1 1 1 2 1
## 171 1199731 3 1 1 1 2
## 172 1199983 1 1 1 1 2
## 173 1200772 1 1 1 1 2
## 174 1200847 6 10 10 10 8
## 175 1200892 8 6 5 4 3
## 176 1200952 5 8 7 7 10
## 177 1201834 2 1 1 1 2
## 178 1201936 5 10 10 3 8
## 179 1202125 4 1 1 1 2
## 180 1202812 5 3 3 3 6
## 181 1203096 1 1 1 1 1
## 182 1204242 1 1 1 1 2
## 183 1204898 6 1 1 1 2
## 184 1205138 5 8 8 8 5
## 185 1205579 8 7 6 4 4
## 186 1206089 2 1 1 1 1
## 187 1206695 1 5 8 6 5
## 188 1206841 10 5 6 10 6
## 189 1207986 5 8 4 10 5
## 190 1208301 1 2 3 1 2
## 191 1210963 10 10 10 8 6
## 192 1211202 7 5 10 10 10
## 193 1212232 5 1 1 1 2
## 194 1212251 1 1 1 1 2
## 195 1212422 3 1 1 1 2
## 196 1212422 4 1 1 1 2
## 197 1213375 8 4 4 5 4
## 198 1213383 5 1 1 4 2
## 199 1214092 1 1 1 1 2
## 200 1214556 3 1 1 1 2
## 201 1214966 9 7 7 5 5
## 202 1216694 10 8 8 4 10
## 203 1216947 1 1 1 1 2
## 204 1217051 5 1 1 1 2
## 205 1217264 1 1 1 1 2
## 206 1218105 5 10 10 9 6
## 207 1218741 10 10 9 3 7
## 208 1218860 1 1 1 1 1
## 209 1218860 1 1 1 1 1
## 210 1219406 5 1 1 1 1
## 211 1219525 8 10 10 10 5
## 212 1219859 8 10 8 8 4
## 213 1220330 1 1 1 1 2
## 214 1221863 10 10 10 10 7
## 215 1222047 10 10 10 10 3
## 216 1222936 8 7 8 7 5
## 217 1223282 1 1 1 1 2
## 218 1223426 1 1 1 1 2
## 219 1223793 6 10 7 7 6
## 220 1223967 6 1 3 1 2
## 221 1224329 1 1 1 2 2
## 222 1225799 10 6 4 3 10
## 223 1226012 4 1 1 3 1
## 224 1226612 7 5 6 3 3
## 225 1227210 10 5 5 6 3
## 226 1227244 1 1 1 1 2
## 227 1227481 10 5 7 4 4
## 228 1228152 8 9 9 5 3
## 229 1228311 1 1 1 1 1
## 230 1230175 10 10 10 3 10
## 231 1230688 7 4 7 4 3
## 232 1231387 6 8 7 5 6
## 233 1231706 8 4 6 3 3
## 234 1232225 10 4 5 5 5
## 235 1236043 3 3 2 1 3
## 236 1241232 3 1 4 1 2
## 237 1241559 10 8 8 2 8
## 238 1241679 9 8 8 5 6
## 239 1242364 8 10 10 8 6
## 240 1243256 10 4 3 2 3
## 241 1270479 5 1 3 3 2
## 242 1276091 3 1 1 3 1
## 243 1277018 2 1 1 1 2
## 244 128059 1 1 1 1 2
## 245 1285531 1 1 1 1 2
## 246 1287775 5 1 1 2 2
## 247 144888 8 10 10 8 5
## 248 145447 8 4 4 1 2
## 249 167528 4 1 1 1 2
## 250 169356 3 1 1 1 2
## 251 183913 1 2 2 1 2
## 252 191250 10 4 4 10 2
## 253 1017023 6 3 3 5 3
## 254 1100524 6 10 10 2 8
## 255 1116116 9 10 10 1 10
## 256 1168736 5 6 6 2 4
## 257 1182404 3 1 1 1 2
## 258 1182404 3 1 1 1 2
## 259 1198641 3 1 1 1 2
## 260 242970 5 7 7 1 5
## 261 255644 10 5 8 10 3
## 262 263538 5 10 10 6 10
## 263 274137 8 8 9 4 5
## 264 303213 10 4 4 10 6
## 265 314428 7 9 4 10 10
## 266 1182404 5 1 4 1 2
## 267 1198641 10 10 6 3 3
## 268 320675 3 3 5 2 3
## 269 324427 10 8 8 2 3
## 270 385103 1 1 1 1 2
## 271 390840 8 4 7 1 3
## 272 411453 5 1 1 1 2
## 273 320675 3 3 5 2 3
## 274 428903 7 2 4 1 3
## 275 431495 3 1 1 1 2
## 276 432809 3 1 3 1 2
## 277 434518 3 1 1 1 2
## 278 452264 1 1 1 1 2
## 279 456282 1 1 1 1 2
## 280 476903 10 5 7 3 3
## 281 486283 3 1 1 1 2
## 282 486662 2 1 1 2 2
## 283 488173 1 4 3 10 4
## 284 492268 10 4 6 1 2
## 285 508234 7 4 5 10 2
## 286 527363 8 10 10 10 8
## 287 529329 10 10 10 10 10
## 288 535331 3 1 1 1 3
## 289 543558 6 1 3 1 4
## 290 555977 5 6 6 8 6
## 291 560680 1 1 1 1 2
## 292 561477 1 1 1 1 2
## 293 563649 8 8 8 1 2
## 294 601265 10 4 4 6 2
## 295 606140 1 1 1 1 2
## 296 606722 5 5 7 8 6
## 297 616240 5 3 4 3 4
## 298 61634 5 4 3 1 2
## 299 625201 8 2 1 1 5
## 300 63375 9 1 2 6 4
## 301 635844 8 4 10 5 4
## 302 636130 1 1 1 1 2
## 303 640744 10 10 10 7 9
## 304 646904 1 1 1 1 2
## 305 653777 8 3 4 9 3
## 306 659642 10 8 4 4 4
## 307 666090 1 1 1 1 2
## 308 666942 1 1 1 1 2
## 309 667204 7 8 7 6 4
## 310 673637 3 1 1 1 2
## 311 684955 2 1 1 1 3
## 312 688033 1 1 1 1 2
## 313 691628 8 6 4 10 10
## 314 693702 1 1 1 1 2
## 315 704097 1 1 1 1 1
## 316 704168 4 6 5 6 7
## 317 706426 5 5 5 2 5
## 318 709287 6 8 7 8 6
## 319 718641 1 1 1 1 5
## 320 721482 4 4 4 4 6
## 321 730881 7 6 3 2 5
## 322 733639 3 1 1 1 2
## 323 733639 3 1 1 1 2
## 324 733823 5 4 6 10 2
## 325 740492 1 1 1 1 2
## 326 743348 3 2 2 1 2
## 327 752904 10 1 1 1 2
## 328 756136 1 1 1 1 2
## 329 760001 8 10 3 2 6
## 330 760239 10 4 6 4 5
## 331 76389 10 4 7 2 2
## 332 764974 5 1 1 1 2
## 333 770066 5 2 2 2 2
## 334 785208 5 4 6 6 4
## 335 785615 8 6 7 3 3
## 336 792744 1 1 1 1 2
## 337 797327 6 5 5 8 4
## 338 798429 1 1 1 1 2
## 339 704097 1 1 1 1 1
## 340 806423 8 5 5 5 2
## 341 809912 10 3 3 1 2
## 342 810104 1 1 1 1 2
## 343 814265 2 1 1 1 2
## 344 814911 1 1 1 1 2
## 345 822829 7 6 4 8 10
## 346 826923 1 1 1 1 2
## 347 830690 5 2 2 2 3
## 348 831268 1 1 1 1 1
## 349 832226 3 4 4 10 5
## 350 832567 4 2 3 5 3
## 351 836433 5 1 1 3 2
## 352 837082 2 1 1 1 2
## 353 846832 3 4 5 3 7
## 354 850831 2 7 10 10 7
## 355 855524 1 1 1 1 2
## 356 857774 4 1 1 1 3
## 357 859164 5 3 3 1 3
## 358 859350 8 10 10 7 10
## 359 866325 8 10 5 3 8
## 360 873549 10 3 5 4 3
## 361 877291 6 10 10 10 10
## 362 877943 3 10 3 10 6
## 363 888169 3 2 2 1 4
## 364 888523 4 4 4 2 2
## 365 896404 2 1 1 1 2
## 366 897172 2 1 1 1 2
## 367 95719 6 10 10 10 8
## 368 160296 5 8 8 10 5
## 369 342245 1 1 3 1 2
## 370 428598 1 1 3 1 1
## 371 492561 4 3 2 1 3
## 372 493452 1 1 3 1 2
## 373 493452 4 1 2 1 2
## 374 521441 5 1 1 2 2
## 375 560680 3 1 2 1 2
## 376 636437 1 1 1 1 2
## 377 640712 1 1 1 1 2
## 378 654244 1 1 1 1 1
## 379 657753 3 1 1 4 3
## 380 685977 5 3 4 1 4
## 381 805448 1 1 1 1 2
## 382 846423 10 6 3 6 4
## 383 1002504 3 2 2 2 2
## 384 1022257 2 1 1 1 2
## 385 1026122 2 1 1 1 2
## 386 1071084 3 3 2 2 3
## 387 1080233 7 6 6 3 2
## 388 1114570 5 3 3 2 3
## 389 1114570 2 1 1 1 2
## 390 1116715 5 1 1 1 3
## 391 1131411 1 1 1 2 2
## 392 1151734 10 8 7 4 3
## 393 1156017 3 1 1 1 2
## 394 1158247 1 1 1 1 1
## 395 1158405 1 2 3 1 2
## 396 1168278 3 1 1 1 2
## 397 1176187 3 1 1 1 2
## 398 1196263 4 1 1 1 2
## 399 1196475 3 2 1 1 2
## 400 1206314 1 2 3 1 2
## 401 1211265 3 10 8 7 6
## 402 1213784 3 1 1 1 2
## 403 1223003 5 3 3 1 2
## 404 1223306 3 1 1 1 2
## 405 1223543 1 2 1 3 2
## 406 1229929 1 1 1 1 2
## 407 1231853 4 2 2 1 2
## 408 1234554 1 1 1 1 2
## 409 1236837 2 3 2 2 2
## 410 1237674 3 1 2 1 2
## 411 1238021 1 1 1 1 2
## 412 1238464 1 1 1 1 1
## 413 1238633 10 10 10 6 8
## 414 1238915 5 1 2 1 2
## 415 1238948 8 5 6 2 3
## 416 1239232 3 3 2 6 3
## 417 1239347 8 7 8 5 10
## 418 1239967 1 1 1 1 2
## 419 1240337 5 2 2 2 2
## 420 1253505 2 3 1 1 5
## 421 1255384 3 2 2 3 2
## 422 1257200 10 10 10 7 10
## 423 1257648 4 3 3 1 2
## 424 1257815 5 1 3 1 2
## 425 1257938 3 1 1 1 2
## 426 1258549 9 10 10 10 10
## 427 1258556 5 3 6 1 2
## 428 1266154 8 7 8 2 4
## 429 1272039 1 1 1 1 2
## 430 1276091 2 1 1 1 2
## 431 1276091 1 3 1 1 2
## 432 1276091 5 1 1 3 4
## 433 1277629 5 1 1 1 2
## 434 1293439 3 2 2 3 2
## 435 1293439 6 9 7 5 5
## 436 1294562 10 8 10 1 3
## 437 1295186 10 10 10 1 6
## 438 527337 4 1 1 1 2
## 439 558538 4 1 3 3 2
## 440 566509 5 1 1 1 2
## 441 608157 10 4 3 10 4
## 442 677910 5 2 2 4 2
## 443 734111 1 1 1 3 2
## 444 734111 1 1 1 1 2
## 445 780555 5 1 1 6 3
## 446 827627 2 1 1 1 2
## 447 1049837 1 1 1 1 2
## 448 1058849 5 1 1 1 2
## 449 1182404 1 1 1 1 1
## 450 1193544 5 7 9 8 6
## 451 1201870 4 1 1 3 1
## 452 1202253 5 1 1 1 2
## 453 1227081 3 1 1 3 2
## 454 1230994 4 5 5 8 6
## 455 1238410 2 3 1 1 3
## 456 1246562 10 2 2 1 2
## 457 1257470 10 6 5 8 5
## 458 1259008 8 8 9 6 6
## 459 1266124 5 1 2 1 2
## 460 1267898 5 1 3 1 2
## 461 1268313 5 1 1 3 2
## 462 1268804 3 1 1 1 2
## 463 1276091 6 1 1 3 2
## 464 1280258 4 1 1 1 2
## 465 1293966 4 1 1 1 2
## 466 1296572 10 9 8 7 6
## 467 1298416 10 6 6 2 4
## 468 1299596 6 6 6 5 4
## 469 1105524 4 1 1 1 2
## 470 1181685 1 1 2 1 2
## 471 1211594 3 1 1 1 1
## 472 1238777 6 1 1 3 2
## 473 1257608 6 1 1 1 1
## 474 1269574 4 1 1 1 2
## 475 1277145 5 1 1 1 2
## 476 1287282 3 1 1 1 2
## 477 1296025 4 1 2 1 2
## 478 1296263 4 1 1 1 2
## 479 1296593 5 2 1 1 2
## 480 1299161 4 8 7 10 4
## 481 1301945 5 1 1 1 1
## 482 1302428 5 3 2 4 2
## 483 1318169 9 10 10 10 10
## 484 474162 8 7 8 5 5
## 485 787451 5 1 2 1 2
## 486 1002025 1 1 1 3 1
## 487 1070522 3 1 1 1 1
## 488 1073960 10 10 10 10 6
## 489 1076352 3 6 4 10 3
## 490 1084139 6 3 2 1 3
## 491 1115293 1 1 1 1 2
## 492 1119189 5 8 9 4 3
## 493 1133991 4 1 1 1 1
## 494 1142706 5 10 10 10 6
## 495 1155967 5 1 2 10 4
## 496 1170945 3 1 1 1 1
## 497 1181567 1 1 1 1 1
## 498 1182404 4 2 1 1 2
## 499 1204558 4 1 1 1 2
## 500 1217952 4 1 1 1 2
## 501 1224565 6 1 1 1 2
## 502 1238186 4 1 1 1 2
## 503 1253917 4 1 1 2 2
## 504 1265899 4 1 1 1 2
## 505 1268766 1 1 1 1 2
## 506 1277268 3 3 1 1 2
## 507 1286943 8 10 10 10 7
## 508 1295508 1 1 1 1 2
## 509 1297327 5 1 1 1 2
## 510 1297522 2 1 1 1 2
## 511 1298360 1 1 1 1 2
## 512 1299924 5 1 1 1 2
## 513 1299994 5 1 1 1 2
## 514 1304595 3 1 1 1 1
## 515 1306282 6 6 7 10 3
## 516 1313325 4 10 4 7 3
## 517 1320077 1 1 1 1 1
## 518 1320077 1 1 1 1 1
## 519 1320304 3 1 2 2 2
## 520 1330439 4 7 8 3 4
## 521 333093 1 1 1 1 3
## 522 369565 4 1 1 1 3
## 523 412300 10 4 5 4 3
## 524 672113 7 5 6 10 4
## 525 749653 3 1 1 1 2
## 526 769612 3 1 1 2 2
## 527 769612 4 1 1 1 2
## 528 798429 4 1 1 1 2
## 529 807657 6 1 3 2 2
## 530 8233704 4 1 1 1 1
## 531 837480 7 4 4 3 4
## 532 867392 4 2 2 1 2
## 533 869828 1 1 1 1 1
## 534 1043068 3 1 1 1 2
## 535 1056171 2 1 1 1 2
## 536 1061990 1 1 3 2 2
## 537 1113061 5 1 1 1 2
## 538 1116192 5 1 2 1 2
## 539 1135090 4 1 1 1 2
## 540 1145420 6 1 1 1 2
## 541 1158157 5 1 1 1 2
## 542 1171578 3 1 1 1 2
## 543 1174841 5 3 1 1 2
## 544 1184586 4 1 1 1 2
## 545 1186936 2 1 3 2 2
## 546 1197527 5 1 1 1 2
## 547 1222464 6 10 10 10 4
## 548 1240603 2 1 1 1 1
## 549 1240603 3 1 1 1 1
## 550 1241035 7 8 3 7 4
## 551 1287971 3 1 1 1 2
## 552 1289391 1 1 1 1 2
## 553 1299924 3 2 2 2 2
## 554 1306339 4 4 2 1 2
## 555 1313658 3 1 1 1 2
## 556 1313982 4 3 1 1 2
## 557 1321264 5 2 2 2 1
## 558 1321321 5 1 1 3 2
## 559 1321348 2 1 1 1 2
## 560 1321931 5 1 1 1 2
## 561 1321942 5 1 1 1 2
## 562 1321942 5 1 1 1 2
## 563 1328331 1 1 1 1 2
## 564 1328755 3 1 1 1 2
## 565 1331405 4 1 1 1 2
## 566 1331412 5 7 10 10 5
## 567 1333104 3 1 2 1 2
## 568 1334071 4 1 1 1 2
## 569 1343068 8 4 4 1 6
## 570 1343374 10 10 8 10 6
## 571 1344121 8 10 4 4 8
## 572 142932 7 6 10 5 3
## 573 183936 3 1 1 1 2
## 574 324382 1 1 1 1 2
## 575 378275 10 9 7 3 4
## 576 385103 5 1 2 1 2
## 577 690557 5 1 1 1 2
## 578 695091 1 1 1 1 2
## 579 695219 1 1 1 1 2
## 580 824249 1 1 1 1 2
## 581 871549 5 1 2 1 2
## 582 878358 5 7 10 6 5
## 583 1107684 6 10 5 5 4
## 584 1115762 3 1 1 1 2
## 585 1217717 5 1 1 6 3
## 586 1239420 1 1 1 1 2
## 587 1254538 8 10 10 10 6
## 588 1261751 5 1 1 1 2
## 589 1268275 9 8 8 9 6
## 590 1272166 5 1 1 1 2
## 591 1294261 4 10 8 5 4
## 592 1295529 2 5 7 6 4
## 593 1298484 10 3 4 5 3
## 594 1311875 5 1 2 1 2
## 595 1315506 4 8 6 3 4
## 596 1320141 5 1 1 1 2
## 597 1325309 4 1 2 1 2
## 598 1333063 5 1 3 1 2
## 599 1333495 3 1 1 1 2
## 600 1334659 5 2 4 1 1
## 601 1336798 3 1 1 1 2
## 602 1344449 1 1 1 1 1
## 603 1350568 4 1 1 1 2
## 604 1352663 5 4 6 8 4
## 605 188336 5 3 2 8 5
## 606 352431 10 5 10 3 5
## 607 353098 4 1 1 2 2
## 608 411453 1 1 1 1 2
## 609 557583 5 10 10 10 10
## 610 636375 5 1 1 1 2
## 611 736150 10 4 3 10 3
## 612 803531 5 10 10 10 5
## 613 822829 8 10 10 10 6
## 614 1016634 2 3 1 1 2
## 615 1031608 2 1 1 1 1
## 616 1041043 4 1 3 1 2
## 617 1042252 3 1 1 1 2
## 618 1057067 1 1 1 1 1
## 619 1061990 4 1 1 1 2
## 620 1073836 5 1 1 1 2
## 621 1083817 3 1 1 1 2
## 622 1096352 6 3 3 3 3
## 623 1140597 7 1 2 3 2
## 624 1149548 1 1 1 1 2
## 625 1174009 5 1 1 2 1
## 626 1183596 3 1 3 1 3
## 627 1190386 4 6 6 5 7
## 628 1190546 2 1 1 1 2
## 629 1213273 2 1 1 1 2
## 630 1218982 4 1 1 1 2
## 631 1225382 6 2 3 1 2
## 632 1235807 5 1 1 1 2
## 633 1238777 1 1 1 1 2
## 634 1253955 8 7 4 4 5
## 635 1257366 3 1 1 1 2
## 636 1260659 3 1 4 1 2
## 637 1268952 10 10 7 8 7
## 638 1275807 4 2 4 3 2
## 639 1277792 4 1 1 1 2
## 640 1277792 5 1 1 3 2
## 641 1285722 4 1 1 3 2
## 642 1288608 3 1 1 1 2
## 643 1290203 3 1 1 1 2
## 644 1294413 1 1 1 1 2
## 645 1299596 2 1 1 1 2
## 646 1303489 3 1 1 1 2
## 647 1311033 1 2 2 1 2
## 648 1311108 1 1 1 3 2
## 649 1315807 5 10 10 10 10
## 650 1318671 3 1 1 1 2
## 651 1319609 3 1 1 2 3
## 652 1323477 1 2 1 3 2
## 653 1324572 5 1 1 1 2
## 654 1324681 4 1 1 1 2
## 655 1325159 3 1 1 1 2
## 656 1326892 3 1 1 1 2
## 657 1330361 5 1 1 1 2
## 658 1333877 5 4 5 1 8
## 659 1334015 7 8 8 7 3
## 660 1334667 1 1 1 1 2
## 661 1339781 1 1 1 1 2
## 662 1339781 4 1 1 1 2
## 663 13454352 1 1 3 1 2
## 664 1345452 1 1 3 1 2
## 665 1345593 3 1 1 3 2
## 666 1347749 1 1 1 1 2
## 667 1347943 5 2 2 2 2
## 668 1348851 3 1 1 1 2
## 669 1350319 5 7 4 1 6
## 670 1350423 5 10 10 8 5
## 671 1352848 3 10 7 8 5
## 672 1353092 3 2 1 2 2
## 673 1354840 2 1 1 1 2
## 674 1354840 5 3 2 1 3
## 675 1355260 1 1 1 1 2
## 676 1365075 4 1 4 1 2
## 677 1365328 1 1 2 1 2
## 678 1368267 5 1 1 1 2
## 679 1368273 1 1 1 1 2
## 680 1368882 2 1 1 1 2
## 681 1369821 10 10 10 10 5
## 682 1371026 5 10 10 10 4
## 683 1371920 5 1 1 1 2
## 684 466906 1 1 1 1 2
## 685 466906 1 1 1 1 2
## 686 534555 1 1 1 1 2
## 687 536708 1 1 1 1 2
## 688 566346 3 1 1 1 2
## 689 603148 4 1 1 1 2
## 690 654546 1 1 1 1 2
## 691 654546 1 1 1 3 2
## 692 695091 5 10 10 5 4
## 693 714039 3 1 1 1 2
## 694 763235 3 1 1 1 2
## 695 776715 3 1 1 1 3
## 696 841769 2 1 1 1 2
## 697 888820 5 10 10 3 7
## 698 897471 4 8 6 4 3
## 699 897471 4 8 8 5 4
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## 1 1 3 1 1 2
## 2 10 3 2 1 2
## 3 2 3 1 1 2
## 4 4 3 7 1 2
## 5 1 3 1 1 2
## 6 10 9 7 1 4
## 7 10 3 1 1 2
## 8 1 3 1 1 2
## 9 1 1 1 5 2
## 10 1 2 1 1 2
## 11 1 3 1 1 2
## 12 1 2 1 1 2
## 13 3 4 4 1 4
## 14 3 3 1 1 2
## 15 9 5 5 4 4
## 16 1 4 3 1 4
## 17 1 2 1 1 2
## 18 1 3 1 1 2
## 19 10 4 1 2 4
## 20 1 3 1 1 2
## 21 10 5 4 4 4
## 22 7 7 10 1 4
## 23 1 2 1 1 2
## 24 ? 7 3 1 4
## 25 1 3 1 1 2
## 26 7 3 6 1 4
## 27 1 2 1 1 2
## 28 1 2 1 1 2
## 29 1 2 1 1 2
## 30 1 1 1 1 2
## 31 1 2 1 1 2
## 32 1 3 1 1 2
## 33 5 7 4 3 4
## 34 1 3 1 1 2
## 35 1 2 1 1 2
## 36 1 2 1 1 2
## 37 1 8 9 1 4
## 38 1 7 1 1 2
## 39 10 5 6 1 4
## 40 7 7 5 1 4
## 41 ? 7 8 1 2
## 42 3 6 5 2 4
## 43 10 7 3 3 4
## 44 1 3 1 1 4
## 45 1 8 10 1 4
## 46 1 2 1 2 2
## 47 9 4 8 1 4
## 48 1 2 1 1 2
## 49 1 3 1 1 2
## 50 8 3 8 2 4
## 51 3 2 1 5 4
## 52 4 3 4 1 4
## 53 5 4 10 2 4
## 54 8 7 3 7 4
## 55 8 7 1 1 4
## 56 5 3 6 1 4
## 57 6 3 9 1 4
## 58 1 5 4 4 4
## 59 10 5 1 1 4
## 60 2 5 1 1 4
## 61 3 4 10 1 4
## 62 2 2 1 1 2
## 63 8 3 3 1 4
## 64 2 3 9 1 4
## 65 1 2 1 1 2
## 66 2 4 3 10 4
## 67 1 3 1 1 2
## 68 10 4 9 1 4
## 69 9 8 9 8 4
## 70 1 3 2 1 2
## 71 1 2 1 1 2
## 72 2 7 8 10 4
## 73 1 7 2 1 2
## 74 10 4 8 1 4
## 75 4 3 2 3 4
## 76 2 4 2 1 2
## 77 1 2 1 1 2
## 78 1 2 1 1 2
## 79 3 3 1 1 2
## 80 1 2 1 1 2
## 81 1 7 1 1 2
## 82 1 2 1 1 2
## 83 1 3 1 1 2
## 84 2 7 1 1 2
## 85 9 7 10 7 4
## 86 4 4 10 10 4
## 87 8 4 4 1 4
## 88 10 6 8 3 4
## 89 1 3 1 1 2
## 90 1 2 1 1 2
## 91 1 3 1 1 2
## 92 1 1 1 1 2
## 93 1 3 1 1 2
## 94 1 2 1 1 2
## 95 1 3 1 1 2
## 96 1 3 1 1 2
## 97 1 1 1 1 2
## 98 1 3 1 1 2
## 99 6 2 9 10 4
## 100 10 7 9 4 4
## 101 5 3 10 2 4
## 102 5 2 5 1 4
## 103 1 3 1 1 2
## 104 3 7 1 1 4
## 105 1 8 8 8 4
## 106 3 3 2 7 4
## 107 10 4 1 1 4
## 108 10 5 7 1 4
## 109 1 2 3 1 2
## 110 9 7 8 3 4
## 111 2 5 3 2 2
## 112 9 3 1 1 4
## 113 10 7 3 3 4
## 114 8 8 1 1 4
## 115 3 3 1 1 2
## 116 5 1 1 1 2
## 117 2 3 2 1 2
## 118 10 7 5 8 4
## 119 3 1 1 1 2
## 120 2 3 1 1 2
## 121 1 3 1 1 2
## 122 2 3 1 1 2
## 123 10 5 3 3 4
## 124 10 5 3 1 4
## 125 7 8 10 1 4
## 126 1 2 1 1 2
## 127 10 7 5 5 4
## 128 1 3 1 1 2
## 129 10 1 6 2 4
## 130 1 1 1 1 2
## 131 1 2 1 1 2
## 132 1 3 1 1 2
## 133 10 3 6 3 4
## 134 1 2 2 1 2
## 135 1 2 1 1 2
## 136 2 3 3 1 2
## 137 1 2 1 1 2
## 138 1 1 1 1 2
## 139 1 2 1 1 2
## 140 ? 2 1 1 2
## 141 1 1 1 1 2
## 142 1 1 1 1 2
## 143 5 4 3 3 4
## 144 5 1 1 1 2
## 145 1 2 1 1 2
## 146 ? 2 1 1 2
## 147 8 4 1 1 4
## 148 2 2 1 1 2
## 149 1 5 8 1 2
## 150 10 7 8 7 4
## 151 1 3 1 1 2
## 152 10 5 4 3 4
## 153 5 8 10 1 4
## 154 3 1 1 1 2
## 155 1 1 1 1 2
## 156 10 3 1 1 4
## 157 1 2 1 1 2
## 158 1 3 1 1 2
## 159 ? 1 1 1 2
## 160 10 7 10 6 4
## 161 10 5 7 2 4
## 162 1 3 2 1 2
## 163 1 3 1 1 2
## 164 3 1 1 7 2
## 165 ? 3 1 1 2
## 166 2 3 2 1 2
## 167 10 3 10 3 4
## 168 1 3 1 10 4
## 169 1 3 1 1 2
## 170 1 1 1 1 2
## 171 1 1 1 1 2
## 172 1 3 1 1 2
## 173 1 2 1 1 2
## 174 10 10 10 7 4
## 175 10 6 1 1 4
## 176 10 5 7 1 4
## 177 1 3 1 1 2
## 178 1 5 10 3 4
## 179 1 3 1 1 2
## 180 10 3 1 1 4
## 181 1 3 1 1 2
## 182 1 1 1 1 2
## 183 1 3 1 1 2
## 184 10 7 8 1 4
## 185 10 5 1 1 4
## 186 1 3 1 1 2
## 187 8 7 10 1 4
## 188 10 7 7 10 4
## 189 8 9 10 1 4
## 190 1 3 1 1 2
## 191 8 7 10 1 4
## 192 10 4 10 3 4
## 193 1 2 1 1 2
## 194 1 3 1 1 2
## 195 1 3 1 1 2
## 196 1 3 1 1 2
## 197 7 7 8 2 2
## 198 1 3 1 1 2
## 199 1 1 1 1 2
## 200 1 2 1 1 2
## 201 10 7 8 3 4
## 202 10 8 1 1 4
## 203 1 3 1 1 2
## 204 1 3 1 1 2
## 205 1 3 1 1 2
## 206 10 7 10 5 4
## 207 5 3 5 1 4
## 208 1 3 1 1 2
## 209 1 3 1 1 2
## 210 1 3 1 1 2
## 211 10 8 10 6 4
## 212 8 7 7 1 4
## 213 1 3 1 1 2
## 214 10 7 10 4 4
## 215 10 10 6 1 4
## 216 5 5 10 2 4
## 217 1 2 1 1 2
## 218 1 3 1 1 2
## 219 4 8 10 2 4
## 220 1 3 1 1 2
## 221 1 3 1 1 2
## 222 10 9 10 1 4
## 223 5 2 1 1 4
## 224 8 7 4 1 4
## 225 10 7 9 2 4
## 226 1 2 1 1 2
## 227 10 8 9 1 4
## 228 5 7 7 1 4
## 229 1 3 1 1 2
## 230 10 9 10 1 4
## 231 7 7 6 1 4
## 232 8 8 9 2 4
## 233 1 4 3 1 2
## 234 10 4 1 1 4
## 235 1 3 6 1 2
## 236 ? 3 1 1 2
## 237 10 4 8 10 4
## 238 2 4 10 4 4
## 239 9 3 10 10 4
## 240 10 5 3 2 4
## 241 2 2 3 1 2
## 242 1 3 1 1 2
## 243 1 3 1 1 2
## 244 5 5 1 1 2
## 245 1 3 1 1 2
## 246 2 3 1 1 2
## 247 10 7 8 1 4
## 248 9 3 3 1 4
## 249 1 3 6 1 2
## 250 ? 3 1 1 2
## 251 1 1 1 1 2
## 252 10 5 3 3 4
## 253 10 3 5 3 2
## 254 10 7 3 3 4
## 255 8 3 3 1 4
## 256 10 3 6 1 4
## 257 1 1 1 1 2
## 258 1 2 1 1 2
## 259 1 3 1 1 2
## 260 8 3 4 1 2
## 261 10 5 1 3 4
## 262 10 10 6 5 4
## 263 10 7 8 1 4
## 264 10 5 5 1 4
## 265 3 5 3 3 4
## 266 1 3 2 1 2
## 267 10 4 3 2 4
## 268 10 7 1 1 4
## 269 4 8 7 8 4
## 270 1 3 1 1 2
## 271 10 3 9 2 4
## 272 1 3 1 1 2
## 273 10 7 1 1 4
## 274 4 3 3 1 4
## 275 1 3 2 1 2
## 276 ? 2 1 1 2
## 277 1 2 1 1 2
## 278 1 2 1 1 2
## 279 1 3 1 1 2
## 280 7 3 3 8 4
## 281 1 3 1 1 2
## 282 1 3 1 1 2
## 283 10 5 6 1 4
## 284 10 5 3 1 4
## 285 10 3 8 2 4
## 286 10 10 7 3 4
## 287 10 4 10 10 4
## 288 1 2 1 1 2
## 289 5 5 10 1 4
## 290 10 4 10 4 4
## 291 1 1 1 1 2
## 292 1 3 1 1 2
## 293 ? 6 10 1 4
## 294 10 2 3 1 4
## 295 ? 2 1 1 2
## 296 10 7 4 1 4
## 297 5 4 7 1 2
## 298 ? 2 3 1 2
## 299 1 1 1 1 2
## 300 10 7 7 2 4
## 301 4 7 10 1 4
## 302 1 3 1 1 2
## 303 10 7 10 10 4
## 304 1 3 1 1 2
## 305 10 3 3 1 4
## 306 10 3 10 4 4
## 307 1 3 1 1 2
## 308 1 3 1 1 2
## 309 3 8 8 4 4
## 310 5 5 1 1 2
## 311 1 2 1 1 2
## 312 1 1 1 1 2
## 313 1 3 5 1 4
## 314 1 1 1 1 2
## 315 1 2 1 1 2
## 316 ? 4 9 1 2
## 317 10 4 3 1 4
## 318 8 8 9 1 4
## 319 1 3 1 1 2
## 320 5 7 3 1 2
## 321 10 7 4 6 4
## 322 ? 3 1 1 2
## 323 1 3 1 1 2
## 324 10 4 1 1 4
## 325 1 3 1 1 2
## 326 1 2 3 1 2
## 327 10 5 4 1 4
## 328 1 2 1 1 2
## 329 4 3 10 1 4
## 330 10 7 1 1 4
## 331 8 6 1 1 4
## 332 1 3 1 2 2
## 333 1 2 2 1 2
## 334 10 4 3 1 4
## 335 10 3 4 2 4
## 336 1 1 1 1 2
## 337 10 3 4 1 4
## 338 1 3 1 1 2
## 339 1 2 1 1 2
## 340 10 4 3 1 4
## 341 10 7 6 1 4
## 342 1 3 1 1 2
## 343 1 1 1 1 2
## 344 1 1 1 1 2
## 345 10 9 5 3 4
## 346 1 1 1 1 2
## 347 1 1 3 1 2
## 348 1 1 3 1 2
## 349 1 3 3 1 4
## 350 8 7 6 1 4
## 351 1 1 1 1 2
## 352 1 3 1 1 2
## 353 3 4 6 1 2
## 354 10 4 9 4 4
## 355 1 2 1 1 2
## 356 1 2 2 1 2
## 357 3 3 3 3 4
## 358 10 7 3 8 4
## 359 4 4 10 3 4
## 360 7 3 5 3 4
## 361 10 8 10 10 4
## 362 10 5 1 4 4
## 363 3 2 1 1 2
## 364 3 2 1 1 2
## 365 1 3 1 1 2
## 366 1 2 1 1 2
## 367 10 7 10 7 4
## 368 10 8 10 3 4
## 369 1 1 1 1 2
## 370 1 2 1 1 2
## 371 1 2 1 1 2
## 372 1 1 1 1 2
## 373 1 2 1 1 2
## 374 1 2 1 1 2
## 375 1 2 1 1 2
## 376 1 1 1 1 2
## 377 1 2 1 1 2
## 378 1 2 1 1 2
## 379 1 2 2 1 2
## 380 1 3 1 1 2
## 381 1 1 1 1 2
## 382 10 7 8 4 4
## 383 1 3 2 1 2
## 384 1 1 1 1 2
## 385 1 1 1 1 2
## 386 1 1 2 3 2
## 387 10 7 1 1 4
## 388 1 3 1 1 2
## 389 1 2 2 1 2
## 390 2 2 2 1 2
## 391 1 2 1 1 2
## 392 10 7 9 1 4
## 393 1 2 1 1 2
## 394 1 1 1 1 2
## 395 1 2 1 1 2
## 396 1 2 1 1 2
## 397 1 3 1 1 2
## 398 1 1 1 1 2
## 399 1 2 2 1 2
## 400 1 1 1 1 2
## 401 9 9 3 8 4
## 402 1 1 1 1 2
## 403 1 2 1 1 2
## 404 4 1 1 1 2
## 405 1 1 2 1 2
## 406 1 2 1 1 2
## 407 1 2 1 1 2
## 408 1 2 1 1 2
## 409 2 3 1 1 2
## 410 1 2 1 1 2
## 411 1 2 1 1 2
## 412 ? 2 1 1 2
## 413 4 8 5 1 4
## 414 1 3 1 1 2
## 415 10 6 6 1 4
## 416 3 3 5 1 2
## 417 10 7 2 1 4
## 418 1 2 1 1 2
## 419 2 3 2 2 2
## 420 1 1 1 1 2
## 421 3 3 1 1 2
## 422 10 8 2 1 4
## 423 1 3 3 1 2
## 424 1 2 1 1 2
## 425 1 1 1 1 2
## 426 10 10 10 1 4
## 427 1 1 1 1 2
## 428 2 5 10 1 4
## 429 1 2 1 1 2
## 430 1 2 1 1 2
## 431 1 2 2 1 2
## 432 1 3 2 1 2
## 433 1 2 2 1 2
## 434 1 1 1 1 2
## 435 8 4 2 1 2
## 436 10 5 1 1 4
## 437 1 2 8 1 4
## 438 1 1 1 1 2
## 439 1 1 1 1 2
## 440 1 1 1 1 2
## 441 10 10 1 1 4
## 442 4 1 1 1 2
## 443 3 1 1 1 2
## 444 2 1 1 1 2
## 445 1 2 1 1 2
## 446 1 1 1 1 2
## 447 1 1 1 1 2
## 448 1 1 1 1 2
## 449 1 1 1 1 2
## 450 10 8 10 1 4
## 451 1 2 1 1 2
## 452 1 1 1 1 2
## 453 1 1 1 1 2
## 454 10 10 7 1 4
## 455 1 1 1 1 2
## 456 6 1 1 2 4
## 457 10 8 6 1 4
## 458 3 10 10 1 4
## 459 1 1 1 1 2
## 460 1 1 1 1 2
## 461 1 1 1 1 2
## 462 5 1 1 1 2
## 463 1 1 1 1 2
## 464 1 1 2 1 2
## 465 1 1 1 1 2
## 466 4 7 10 3 4
## 467 10 9 7 1 4
## 468 10 7 6 2 4
## 469 1 1 1 1 2
## 470 1 2 1 1 2
## 471 1 2 1 1 2
## 472 1 1 1 1 2
## 473 1 1 1 1 2
## 474 1 1 1 1 2
## 475 1 1 1 1 2
## 476 1 1 1 1 2
## 477 1 1 1 1 2
## 478 1 1 1 1 2
## 479 1 1 1 1 2
## 480 10 7 5 1 4
## 481 1 1 1 1 2
## 482 1 1 1 1 2
## 483 5 10 10 10 4
## 484 10 9 10 1 4
## 485 1 1 1 1 2
## 486 3 1 1 1 2
## 487 1 2 1 1 2
## 488 10 8 1 5 4
## 489 3 3 4 1 4
## 490 4 4 1 1 4
## 491 1 1 1 1 2
## 492 10 7 1 1 4
## 493 1 2 1 1 2
## 494 10 6 5 2 4
## 495 5 2 1 1 2
## 496 1 2 1 1 2
## 497 1 1 1 1 2
## 498 1 1 1 1 2
## 499 1 2 1 1 2
## 500 1 2 1 1 2
## 501 1 3 1 1 2
## 502 1 2 1 1 2
## 503 1 2 1 1 2
## 504 1 3 1 1 2
## 505 1 1 1 1 2
## 506 1 1 1 1 2
## 507 5 4 8 7 4
## 508 4 1 1 1 2
## 509 1 1 1 1 2
## 510 1 1 1 1 2
## 511 1 1 1 1 2
## 512 1 2 1 1 2
## 513 1 1 1 1 2
## 514 1 2 1 1 2
## 515 10 8 10 2 4
## 516 10 9 10 1 4
## 517 1 1 1 1 2
## 518 1 2 1 1 2
## 519 1 1 1 1 2
## 520 10 9 1 1 4
## 521 1 1 1 1 2
## 522 1 1 1 1 2
## 523 5 7 3 1 4
## 524 10 5 3 1 4
## 525 1 2 1 1 2
## 526 1 1 1 1 2
## 527 1 1 1 1 2
## 528 1 3 1 1 2
## 529 1 1 1 1 2
## 530 1 2 1 1 2
## 531 10 6 9 1 4
## 532 1 2 1 1 2
## 533 1 3 1 1 2
## 534 1 2 1 1 2
## 535 1 2 1 1 2
## 536 1 3 1 1 2
## 537 1 3 1 1 2
## 538 1 3 1 1 2
## 539 1 2 1 1 2
## 540 1 2 1 1 2
## 541 2 2 1 1 2
## 542 1 1 1 1 2
## 543 1 1 1 1 2
## 544 1 2 1 1 2
## 545 1 2 1 1 2
## 546 1 2 1 1 2
## 547 10 7 10 1 4
## 548 1 1 1 1 2
## 549 1 1 1 1 2
## 550 5 7 8 2 4
## 551 1 2 1 1 2
## 552 1 3 1 1 2
## 553 1 4 2 1 2
## 554 5 2 1 2 2
## 555 1 1 1 1 2
## 556 1 4 8 1 2
## 557 1 2 1 1 2
## 558 1 1 1 1 2
## 559 1 2 1 1 2
## 560 1 2 1 1 2
## 561 1 3 1 1 2
## 562 1 3 1 1 2
## 563 1 3 1 1 2
## 564 1 2 1 1 2
## 565 1 3 2 1 2
## 566 10 10 10 1 4
## 567 1 3 1 1 2
## 568 3 2 1 1 2
## 569 10 2 5 2 4
## 570 5 10 3 1 4
## 571 10 8 2 1 4
## 572 10 9 10 2 4
## 573 1 2 1 1 2
## 574 1 2 1 1 2
## 575 2 7 7 1 4
## 576 1 3 1 1 2
## 577 1 2 1 1 2
## 578 1 2 1 1 2
## 579 1 2 1 1 2
## 580 1 3 1 1 2
## 581 1 2 1 1 2
## 582 10 7 5 1 4
## 583 10 6 10 1 4
## 584 1 1 1 1 2
## 585 1 1 1 1 2
## 586 1 1 1 1 2
## 587 10 10 10 1 4
## 588 1 2 2 1 2
## 589 3 4 1 1 4
## 590 1 1 1 1 2
## 591 1 10 1 1 4
## 592 10 7 6 1 4
## 593 10 4 1 1 4
## 594 1 1 1 1 2
## 595 10 7 1 1 4
## 596 1 2 1 1 2
## 597 1 2 1 1 2
## 598 1 3 1 1 2
## 599 1 2 1 1 2
## 600 1 1 1 1 2
## 601 1 2 1 1 2
## 602 1 2 1 1 2
## 603 1 2 1 1 2
## 604 1 8 10 1 4
## 605 10 8 1 2 4
## 606 8 7 8 3 4
## 607 1 1 1 1 2
## 608 1 1 1 1 2
## 609 10 10 1 1 4
## 610 1 1 1 1 2
## 611 10 7 1 2 4
## 612 2 8 5 1 4
## 613 10 10 10 10 4
## 614 1 2 1 1 2
## 615 1 2 1 1 2
## 616 1 2 1 1 2
## 617 1 2 1 1 2
## 618 ? 1 1 1 2
## 619 1 2 1 1 2
## 620 1 2 1 1 2
## 621 1 2 1 1 2
## 622 2 6 1 1 2
## 623 1 2 1 1 2
## 624 1 1 1 1 2
## 625 1 2 1 1 2
## 626 4 1 1 1 2
## 627 6 7 7 3 4
## 628 5 1 1 1 2
## 629 1 1 1 1 2
## 630 1 1 1 1 2
## 631 1 1 1 1 2
## 632 1 2 1 1 2
## 633 1 1 1 1 2
## 634 3 5 10 1 4
## 635 1 1 1 1 2
## 636 1 1 1 1 2
## 637 1 10 10 3 4
## 638 2 2 1 1 2
## 639 1 1 1 1 2
## 640 1 1 1 1 2
## 641 1 1 1 1 2
## 642 1 2 1 1 2
## 643 1 2 1 1 2
## 644 1 1 1 1 2
## 645 1 1 1 1 2
## 646 1 2 1 1 2
## 647 1 1 1 1 2
## 648 1 1 1 1 2
## 649 2 10 10 10 4
## 650 1 2 1 1 2
## 651 4 1 1 1 2
## 652 1 2 1 1 2
## 653 1 2 2 1 2
## 654 1 2 1 1 2
## 655 1 3 1 1 2
## 656 1 2 1 1 2
## 657 1 2 1 1 2
## 658 1 3 6 1 2
## 659 10 7 2 3 4
## 660 1 1 1 1 2
## 661 1 2 1 1 2
## 662 1 3 1 1 2
## 663 1 2 1 1 2
## 664 1 2 1 1 2
## 665 1 2 1 1 2
## 666 1 1 1 1 2
## 667 1 1 1 2 2
## 668 1 3 1 1 2
## 669 1 7 10 3 4
## 670 5 7 10 1 4
## 671 8 7 4 1 4
## 672 1 3 1 1 2
## 673 1 3 1 1 2
## 674 1 1 1 1 2
## 675 1 2 1 1 2
## 676 1 1 1 1 2
## 677 1 2 1 1 2
## 678 1 1 1 1 2
## 679 1 1 1 1 2
## 680 1 1 1 1 2
## 681 10 10 10 7 4
## 682 10 5 6 3 4
## 683 1 3 2 1 2
## 684 1 1 1 1 2
## 685 1 1 1 1 2
## 686 1 1 1 1 2
## 687 1 1 1 1 2
## 688 1 2 3 1 2
## 689 1 1 1 1 2
## 690 1 1 1 8 2
## 691 1 1 1 1 2
## 692 5 4 4 1 4
## 693 1 1 1 1 2
## 694 1 2 1 2 2
## 695 2 1 1 1 2
## 696 1 1 1 1 2
## 697 3 8 10 2 4
## 698 4 10 6 1 4
## 699 5 10 4 1 4
### Recode the numeric Class column (2 = benign, 4 = malignant) and tabulate.
### The braced block builds a named lookup vector and translates each value via
### dict[as.character(.)]; unname drops the lookup names and factor() pins the
### level order before table() counts the two classes.
raw_breast_cancer$Class %>%
{ dict <- c("2" = "benign", "4" = "malignant")
dict[as.character(.)]
} %>%
unname %>%
factor(levels = c("benign", "malignant")) %>%
table
## .
## benign malignant
## 458 241
### Saving a file
# NOTE(review): piping into save() stores the object under the name the pipe
# passes along; with magrittr >= 2.0 the call is rewritten so the object is
# saved as `formatted_breast_cancer` — confirm behavior on older magrittr.
formatted_breast_cancer %>%
save(file = "C:/Users/HP/Desktop/Data Science/Learning R Language/formatted-breast-cancer.rda")
### loading a saved file
# load() restores the saved object(s) into the current environment.
load("C:/Users/HP/Desktop/Data Science/Learning R Language/formatted-breast-cancer.rda")
#data(DNA)
#str(DNA)
### Boston Housing Datasets
# BostonHousing ships with the mlbench package: 506 census tracts, 14 variables
# (13 numeric predictors plus the factor `chas`).
data(BostonHousing)
str(BostonHousing)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
### URL for a copy of the Boston housing data set.
boston <- "http://tinyurl.com/zq2u8vx"
#data(BostonHousing2)
#str(BostonHousing2)
#boston_housing <- read.table(boston)
#str(boston_housing)
### Column classes for read.table: every column is numeric except "chas",
### which is a factor. Built with a single vectorized ifelse over the column
### names instead of rep() followed by indexed assignment — same result.
col_classes <- ifelse(names(BostonHousing) == "chas", "factor", "numeric")
#boston_housing <- read.table(boston,col.names = names(BostonHousing),colClasses = col_classes)
#str(boston_housing)
### Readr Package
library(readr)
### Read the raw Wisconsin breast cancer data. The file has no header row, so
### column names are borrowed from mlbench's BreastCancer data set.
### Fix: show_col_types takes a logical, not the string "TRUE".
raw_breast_cancer <- read_csv("C:/Users/HP/Desktop/Data Science/Machine learning/breast+cancer+wisconsin+original/breast-cancer-wisconsin.data",show_col_types = TRUE, col_names = names(BreastCancer))
raw_breast_cancer %>% head()
## # A tibble: 6 × 11
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1000025 5 1 1 1 2
## 2 1002945 5 4 4 5 7
## 3 1015425 3 1 1 1 2
## 4 1016277 6 8 8 1 3
## 5 1017023 4 1 1 3 2
## 6 1017122 8 10 10 8 7
## # ℹ 5 more variables: Bare.nuclei <chr>, Bl.cromatin <dbl>,
## # Normal.nucleoli <dbl>, Mitoses <dbl>, Class <dbl>
### Same call with the arguments in a different order; read_csv matches them
### by name, so the result is identical.
raw_breast_cancer <- read_csv("C:/Users/HP/Desktop/Data Science/Machine learning/breast+cancer+wisconsin+original/breast-cancer-wisconsin.data",
col_names = names(BreastCancer), show_col_types = TRUE)
raw_breast_cancer %>% head()
## # A tibble: 6 × 11
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1000025 5 1 1 1 2
## 2 1002945 5 4 4 5 7
## 3 1015425 3 1 1 1 2
## 4 1016277 6 8 8 1 3
## 5 1017023 4 1 1 3 2
## 6 1017122 8 10 10 8 7
## # ℹ 5 more variables: Bare.nuclei <chr>, Bl.cromatin <dbl>,
## # Normal.nucleoli <dbl>, Mitoses <dbl>, Class <dbl>
### Manipulating Data with dplyr
library(tidyverse)
# Convert the built-in iris data.frame to a tibble for the nicer print method
# (shows dimensions, column types, and only the first 10 rows).
iris %>% tibble::as_tibble()
## # A tibble: 150 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # ℹ 140 more rows
### select(): pick Species plus one measurement column at a time.
### Fix: tbl_df() was deprecated and is defunct in dplyr >= 1.0.0;
### as_tibble() is its documented replacement and yields the same tibble.
iris %>% as_tibble() %>% select(Species,Petal.Width) %>% head()
## # A tibble: 6 × 2
## Species Petal.Width
## <fct> <dbl>
## 1 setosa 0.2
## 2 setosa 0.2
## 3 setosa 0.2
## 4 setosa 0.2
## 5 setosa 0.2
## 6 setosa 0.4
iris %>% as_tibble() %>% select(Species,Sepal.Width) %>% head()
## # A tibble: 6 × 2
## Species Sepal.Width
## <fct> <dbl>
## 1 setosa 3.5
## 2 setosa 3
## 3 setosa 3.2
## 4 setosa 3.1
## 5 setosa 3.6
## 6 setosa 3.9
iris %>% as_tibble() %>% select(Species,Petal.Length) %>% head()
## # A tibble: 6 × 2
## Species Petal.Length
## <fct> <dbl>
## 1 setosa 1.4
## 2 setosa 1.4
## 3 setosa 1.3
## 4 setosa 1.5
## 5 setosa 1.4
## 6 setosa 1.7
iris %>% as_tibble() %>% select(Species,Sepal.Length) %>% head()
## # A tibble: 6 × 2
## Species Sepal.Length
## <fct> <dbl>
## 1 setosa 5.1
## 2 setosa 4.9
## 3 setosa 4.7
## 4 setosa 4.6
## 5 setosa 5
## 6 setosa 5.4
### select() helpers: name lists, ranges, and tidyselect predicates.
### Fix: tbl_df() is defunct in dplyr >= 1.0.0; replaced with as_tibble().
iris %>% as_tibble() %>%
select(Sepal.Width, Petal.Width) %>% head() ## Select from Sepal.Width and Petal.Width
## # A tibble: 6 × 2
## Sepal.Width Petal.Width
## <dbl> <dbl>
## 1 3.5 0.2
## 2 3 0.2
## 3 3.2 0.2
## 4 3.1 0.2
## 5 3.6 0.2
## 6 3.9 0.4
iris %>% as_tibble() %>% select(Sepal.Length:Petal.Length) %>% head() ## Select from Sepal.Length to Petal.Length
## # A tibble: 6 × 3
## Sepal.Length Sepal.Width Petal.Length
## <dbl> <dbl> <dbl>
## 1 5.1 3.5 1.4
## 2 4.9 3 1.4
## 3 4.7 3.2 1.3
## 4 4.6 3.1 1.5
## 5 5 3.6 1.4
## 6 5.4 3.9 1.7
iris %>% as_tibble() %>% select(starts_with("Petal")) %>% head() ## Select Petal.Length and Width only
## # A tibble: 6 × 2
## Petal.Length Petal.Width
## <dbl> <dbl>
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
## 6 1.7 0.4
iris %>% as_tibble() %>%
select(ends_with("Length")) %>% head()
## # A tibble: 6 × 2
## Sepal.Length Petal.Length
## <dbl> <dbl>
## 1 5.1 1.4
## 2 4.9 1.4
## 3 4.7 1.3
## 4 4.6 1.5
## 5 5 1.4
## 6 5.4 1.7
iris %>% as_tibble() %>%
select(contains("etal")) %>% head()
## # A tibble: 6 × 2
## Petal.Length Petal.Width
## <dbl> <dbl>
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
## 6 1.7 0.4
iris %>% as_tibble() %>% select(contains("al")) %>% head()
## # A tibble: 6 × 4
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## <dbl> <dbl> <dbl> <dbl>
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
### matches() takes a regular expression; ".t." matches any name with a "t"
### that has characters on both sides, i.e. all four measurement columns.
iris %>% as_tibble() %>%
select(matches(".t.")) %>% head()
## # A tibble: 6 × 4
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## <dbl> <dbl> <dbl> <dbl>
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
iris %>% as_tibble() %>%
select(-starts_with("Petal")) %>% head() ### inverse selection
## # A tibble: 6 × 3
## Sepal.Length Sepal.Width Species
## <dbl> <dbl> <fct>
## 1 5.1 3.5 setosa
## 2 4.9 3 setosa
## 3 4.7 3.2 setosa
## 4 4.6 3.1 setosa
## 5 5 3.6 setosa
## 6 5.4 3.9 setosa
### mutate() adds computed columns; transmute() keeps only the listed ones.
### Fix: tbl_df() is defunct in dplyr >= 1.0.0; replaced with as_tibble().
iris %>% as_tibble() %>%
mutate(Petal.Width.plus.Length = Petal.Width + Petal.Length) %>%select(Species, Petal.Width.plus.Length) %>%head()
## # A tibble: 6 × 2
## Species Petal.Width.plus.Length
## <fct> <dbl>
## 1 setosa 1.6
## 2 setosa 1.6
## 3 setosa 1.5
## 4 setosa 1.7
## 5 setosa 1.6
## 6 setosa 2.1
iris %>% as_tibble() %>% mutate(Sepal.Width.plus.Length = Sepal.Width + Sepal.Length) %>%select(Species, Sepal.Width.plus.Length) %>%head()
## # A tibble: 6 × 2
## Species Sepal.Width.plus.Length
## <fct> <dbl>
## 1 setosa 8.6
## 2 setosa 7.9
## 3 setosa 7.9
## 4 setosa 7.7
## 5 setosa 8.6
## 6 setosa 9.3
### A single mutate() can define several columns at once.
iris %>% as_tibble() %>%mutate(Petal.Width.plus.Length = Petal.Width + Petal.Length,Sepal.Width.plus.Length = Sepal.Width + Sepal.Length) %>%select(Species,Petal.Width.plus.Length, Sepal.Width.plus.Length) %>% head()
## # A tibble: 6 × 3
## Species Petal.Width.plus.Length Sepal.Width.plus.Length
## <fct> <dbl> <dbl>
## 1 setosa 1.6 8.6
## 2 setosa 1.6 7.9
## 3 setosa 1.5 7.9
## 4 setosa 1.7 7.7
## 5 setosa 1.6 8.6
## 6 setosa 2.1 9.3
### transmute() = mutate() + select() in one step.
iris %>% as_tibble() %>%transmute(Species ,Petal.Width.plus.Length = Petal.Width + Petal.Length) %>%
head()
## # A tibble: 6 × 2
## Species Petal.Width.plus.Length
## <fct> <dbl>
## 1 setosa 1.6
## 2 setosa 1.6
## 3 setosa 1.5
## 4 setosa 1.7
## 5 setosa 1.6
## 6 setosa 2.1
### arrange(), filter(), and group_by() demos.
### Fix: tbl_df() is defunct in dplyr >= 1.0.0; replaced with as_tibble().
iris %>% as_tibble() %>% arrange(Sepal.Length) %>% head()
## # A tibble: 6 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 4.3 3 1.1 0.1 setosa
## 2 4.4 2.9 1.4 0.2 setosa
## 3 4.4 3 1.3 0.2 setosa
## 4 4.4 3.2 1.3 0.2 setosa
## 5 4.5 2.3 1.3 0.3 setosa
## 6 4.6 3.1 1.5 0.2 setosa
#from smallest to largest in Sepal.Length column
iris %>% as_tibble() %>% arrange(Petal.Width) %>% head()
## # A tibble: 6 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 4.9 3.1 1.5 0.1 setosa
## 2 4.8 3 1.4 0.1 setosa
## 3 4.3 3 1.1 0.1 setosa
## 4 5.2 4.1 1.5 0.1 setosa
## 5 4.9 3.6 1.4 0.1 setosa
## 6 5.1 3.5 1.4 0.2 setosa
#from smallest to largest in Petal.Width column
iris %>% as_tibble() %>%
arrange(desc(Sepal.Length)) %>%
head() ### from largest to smallest in Sepal.Length column
## # A tibble: 6 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 7.9 3.8 6.4 2 virginica
## 2 7.7 3.8 6.7 2.2 virginica
## 3 7.7 2.6 6.9 2.3 virginica
## 4 7.7 2.8 6.7 2 virginica
## 5 7.7 3 6.1 2.3 virginica
## 6 7.6 3 6.6 2.1 virginica
iris %>% as_tibble() %>%
filter(Sepal.Length >= 5) %>%
head()
## # A tibble: 6 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 5 3.6 1.4 0.2 setosa
## 3 5.4 3.9 1.7 0.4 setosa
## 4 5 3.4 1.5 0.2 setosa
## 5 5.4 3.7 1.5 0.2 setosa
## 6 5.8 4 1.2 0.2 setosa
iris %>% as_tibble() %>%
filter(Petal.Length <= 1.4) %>%
head()
## # A tibble: 6 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 5 3.6 1.4 0.2 setosa
## 5 4.6 3.4 1.4 0.3 setosa
## 6 4.4 2.9 1.4 0.2 setosa
### filter() accepts multiple conditions combined with the elementwise &.
iris %>% as_tibble() %>%filter(Sepal.Length > 5 & Species == "virginica") %>%select(Species, Sepal.Length) %>%head()
## # A tibble: 6 × 2
## Species Sepal.Length
## <fct> <dbl>
## 1 virginica 6.3
## 2 virginica 5.8
## 3 virginica 7.1
## 4 virginica 6.3
## 5 virginica 6.5
## 6 virginica 7.6
### group_by() only annotates the tibble with groups; rows are unchanged.
iris %>% as_tibble() %>% group_by(Species) %>% head()
## # A tibble: 6 × 5
## # Groups: Species [1]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
### summarise(): whole-table summaries, then per-group summaries after group_by().
iris %>% summarise(Mean.Petal.Length = mean(Petal.Length),
Mean.Sepal.Length = mean(Sepal.Length))
## Mean.Petal.Length Mean.Sepal.Length
## 1 3.758 5.843333
iris %>% summarise(Std.Petal.Length = sd(Petal.Length), Std.Sepal.Length = sd(Sepal.Length))
## Std.Petal.Length Std.Sepal.Length
## 1 1.765298 0.8280661
iris %>% summarise(Mean.Petal.Width = mean(Petal.Width),
Mean.Sepal.Width = mean(Sepal.Width))
## Mean.Petal.Width Mean.Sepal.Width
## 1 1.199333 3.057333
### Fix: result column was misspelled "Var.Seapl.Width".
iris %>% summarise(Var.Petal.Width = var(Petal.Width), Var.Sepal.Width = var(Sepal.Width))
## Var.Petal.Width Var.Sepal.Width
## 1 0.5810063 0.1899794
iris %>%group_by(Species) %>%summarise(Mean.Petal.Length = mean(Petal.Length))
## # A tibble: 3 × 2
## Species Mean.Petal.Length
## <fct> <dbl>
## 1 setosa 1.46
## 2 versicolor 4.26
## 3 virginica 5.55
iris %>% group_by(Species) %>% summarise(Var.Petal.Length = var(Petal.Length))
## # A tibble: 3 × 2
## Species Var.Petal.Length
## <fct> <dbl>
## 1 setosa 0.0302
## 2 versicolor 0.221
## 3 virginica 0.305
iris %>% group_by(Species) %>% summarise(Mean.Sepal.Length = mean(Sepal.Length))
## # A tibble: 3 × 2
## Species Mean.Sepal.Length
## <fct> <dbl>
## 1 setosa 5.01
## 2 versicolor 5.94
## 3 virginica 6.59
iris %>% group_by(Species)%>%
summarize(Median.Petal.Width = median(Petal.Width))
## # A tibble: 3 × 2
## Species Median.Petal.Width
## <fct> <dbl>
## 1 setosa 0.2
## 2 versicolor 1.3
## 3 virginica 2
iris %>% group_by(Species)%>% summarize(Median.Sepal.Width = median(Sepal.Width))
## # A tibble: 3 × 2
## Species Median.Sepal.Width
## <fct> <dbl>
## 1 setosa 3.4
## 2 versicolor 2.8
## 3 virginica 3
### n() counts rows; it only works inside summarise()/mutate().
iris %>% summarise(Observations = n())
## Observations
## 1 150
iris %>%
group_by(Species) %>%
summarise(Number.Of.Species = n())
## # A tibble: 3 × 2
## Species Number.Of.Species
## <fct> <int>
## 1 setosa 50
## 2 versicolor 50
## 3 virginica 50
iris %>%group_by(Species) %>%summarise(Number.Of.Samples = n(), Mean.Petal.Length = mean(Petal.Length))
## # A tibble: 3 × 3
## Species Number.Of.Samples Mean.Petal.Length
## <fct> <int> <dbl>
## 1 setosa 50 1.46
## 2 versicolor 50 4.26
## 3 virginica 50 5.55
iris %>%
group_by(Species) %>%
summarise(Number.Of.Samples = n(),
Mean.Sepal.Length = mean(Sepal.Length))
## # A tibble: 3 × 3
## Species Number.Of.Samples Mean.Sepal.Length
## <fct> <int> <dbl>
## 1 setosa 50 5.01
## 2 versicolor 50 5.94
## 3 virginica 50 6.59
### Map raw Class codes (2/4) onto a benign/malignant factor with a fixed
### level order. Written as an ordinary base-R function instead of a magrittr
### functional sequence; for the same input vector it returns the same factor.
format_class <- function(x) {
  dict <- c("2" = "benign", "4" = "malignant")
  factor(unname(dict[as.character(x)]), levels = c("benign", "malignant"))
}
# Apply the recoding to the raw data: Class becomes a benign/malignant factor.
formatted_breast_cancer <-
raw_breast_cancer %>% mutate(Class = format_class(Class))
# Per-class mean of clump thickness — malignant tumors are markedly thicker.
formatted_breast_cancer %>%
group_by(Class) %>%
summarise(mean.Cl.thickness = mean(Cl.thickness))
## # A tibble: 2 × 2
## Class mean.Cl.thickness
## <fct> <dbl>
## 1 benign 2.96
## 2 malignant 7.20
# Per-class mean of cell size shows the same separation.
formatted_breast_cancer %>%
group_by(Class) %>%
summarise(mean.Cell.size = mean(Cell.size))
## # A tibble: 2 × 2
## Class mean.Cell.size
## <fct> <dbl>
## 1 benign 1.33
## 2 malignant 6.57
# Count how often each (Cell.size, Class) combination occurs. Grouping on two
# columns yields one row per observed combination; summarise() drops the last
# grouping level, leaving the result grouped by Cell.size.
formatted_breast_cancer %>%
arrange(Cell.size) %>%
group_by(Cell.size, Class) %>%
summarise(ClassCount = n())
## # A tibble: 18 × 3
## # Groups: Cell.size [10]
## Cell.size Class ClassCount
## <dbl> <fct> <int>
## 1 1 benign 380
## 2 1 malignant 4
## 3 2 benign 37
## 4 2 malignant 8
## 5 3 benign 27
## 6 3 malignant 25
## 7 4 benign 9
## 8 4 malignant 31
## 9 5 malignant 30
## 10 6 benign 2
## 11 6 malignant 25
## 12 7 benign 1
## 13 7 malignant 18
## 14 8 benign 1
## 15 8 malignant 28
## 16 9 benign 1
## 17 9 malignant 5
## 18 10 malignant 67
# Mean clump thickness per (Class, Cell.size) cell. Grouping on an expression
# produces the awkward column name `as.factor(Cell.size)` in the output.
formatted_breast_cancer %>%
group_by(Class, as.factor(Cell.size)) %>%
summarise(mean.Cl.thickness = mean(Cl.thickness))
## # A tibble: 18 × 3
## # Groups: Class [2]
## Class `as.factor(Cell.size)` mean.Cl.thickness
## <fct> <fct> <dbl>
## 1 benign 1 2.76
## 2 benign 2 3.49
## 3 benign 3 3.81
## 4 benign 4 5.11
## 5 benign 6 5
## 6 benign 7 5
## 7 benign 8 6
## 8 benign 9 6
## 9 malignant 1 7.25
## 10 malignant 2 6.75
## 11 malignant 3 6.44
## 12 malignant 4 7.71
## 13 malignant 5 6.87
## 14 malignant 6 6.88
## 15 malignant 7 6.89
## 16 malignant 8 7.18
## 17 malignant 9 8.8
## 18 malignant 10 7.52
#formatted_breast_cancer %>%group_by(as.factor(Cell.size), Class) %>% summarise(mean.thickness = mean(Cl.thickness))
# Boxplot of Petal.Length by Species using qplot (data = . pipes the frame in).
# NOTE(review): gather() is superseded by pivot_longer(); kept here because
# pivot_longer orders the long rows differently, which would change head().
iris %>% select(Species, Petal.Length) %>% head()
## Species Petal.Length
## 1 setosa 1.4
## 2 setosa 1.4
## 3 setosa 1.3
## 4 setosa 1.5
## 5 setosa 1.4
## 6 setosa 1.7
iris %>% select(Species, Petal.Length) %>%
qplot(Species, Petal.Length, geom = "boxplot", data = .)
#library(tidyr)
# gather() stacks the two Sepal columns into key/value pairs: one row per
# original (row, column) combination, all Sepal.Length rows first.
iris %>%gather(key = Attribute, value = Measurement,Sepal.Length, Sepal.Width) %>%select(Species, Attribute, Measurement) %>% head()
## Species Attribute Measurement
## 1 setosa Sepal.Length 5.1
## 2 setosa Sepal.Length 4.9
## 3 setosa Sepal.Length 4.7
## 4 setosa Sepal.Length 4.6
## 5 setosa Sepal.Length 5.0
## 6 setosa Sepal.Length 5.4
# Long format enables faceting: one boxplot panel per Species.
iris %>%
gather(key = Attribute, value = Measurement,
Sepal.Length, Sepal.Width) %>% select(Species, Attribute, Measurement) %>% qplot(Attribute, Measurement, geom = "boxplot", facets = . ~ Species, data = .)
# Same reshaping applied to the Petal columns.
iris %>%
gather(key = Attribute, value = Measurement,
Petal.Length, Petal.Width) %>%
select(Species, Attribute, Measurement) %>%
head()
## Species Attribute Measurement
## 1 setosa Petal.Length 1.4
## 2 setosa Petal.Length 1.4
## 3 setosa Petal.Length 1.3
## 4 setosa Petal.Length 1.5
## 5 setosa Petal.Length 1.4
## 6 setosa Petal.Length 1.7
iris %>%
gather(key = Attribute, value = Measurement,
Petal.Length, Petal.Width) %>%
select(Species, Attribute, Measurement) %>%
qplot(Attribute, Measurement,
geom = "boxplot",
facets = . ~ Species, data = .)
## **Chapter 4**
### Visualizing Data
library(graphics)
### Scatterplot of two standard-normal samples.
### Fix: use <- for top-level assignment instead of =.
x <- rnorm(100)
y <- rnorm(100)
plot(x,y, type="p", main ="X against Y", col = "red")
data(cars)
# %$% (magrittr) exposes the columns of cars so speed/dist can be used directly.
cars %$% plot(speed, dist, main = "Cars data",
xlab = "Speed", ylab = "Stopping distance")
cars %>% plot(dist ~ speed, data = ., main ="Cars data", col = "red")
cars %$% plot(speed, dist, main = "Cars data", type = "h",
xlab = "Speed", ylab = "Stopping Distance")
cars %$% plot(speed, dist, main = "Cars data", type = "s",
xlab = "Speed", ylab = "Stopping Distance")
cars %$% hist(speed, col = "orange")
cars %$% hist(dist, col= "blue")
cars %>% lm(dist ~ speed, data = .) %>% plot ### Plotting Linear Models
# NOTE(review): `data` shadows utils::data() in variable lookup; harmless here
# because function calls still resolve to the function, but a more specific
# name would be safer.
data <- longley
data
## GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
## 1947 83.0 234.289 235.6 159.0 107.608 1947 60.323
## 1948 88.5 259.426 232.5 145.6 108.632 1948 61.122
## 1949 88.2 258.054 368.2 161.6 109.773 1949 60.171
## 1950 89.5 284.599 335.1 165.0 110.929 1950 61.187
## 1951 96.2 328.975 209.9 309.9 112.075 1951 63.221
## 1952 98.1 346.999 193.2 359.4 113.270 1952 63.639
## 1953 99.0 365.385 187.0 354.7 115.094 1953 64.989
## 1954 100.0 363.112 357.8 335.0 116.219 1954 63.761
## 1955 101.2 397.469 290.4 304.8 117.388 1955 66.019
## 1956 104.6 419.180 282.2 285.7 118.734 1956 67.857
## 1957 108.4 442.769 293.6 279.8 120.445 1957 68.169
## 1958 110.8 444.546 468.1 263.7 121.950 1958 66.513
## 1959 112.6 482.704 381.3 255.2 123.366 1959 68.655
## 1960 114.2 502.601 393.1 251.4 125.368 1960 69.564
## 1961 115.7 518.173 480.6 257.2 127.852 1961 69.331
## 1962 116.9 554.894 400.7 282.7 130.081 1962 70.551
# First attempt: lines() for Armed.Forces may fall outside the y-range set by
# the Unemployed series alone.
longley %>% plot(Unemployed ~ Year, data = ., type = 'l')
longley %>% lines(Armed.Forces ~ Year, data = ., col = "blue")
# Second attempt: ylim spans both series so the added line fits the plot region.
longley %$% plot(Unemployed ~ Year, type = 'l',
ylim = range(c(Unemployed, Armed.Forces)))
longley %>% lines(Armed.Forces ~ Year, data = ., col = "green")
as_tibble(longley)
## # A tibble: 16 × 7
## GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
## <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 83 234. 236. 159 108. 1947 60.3
## 2 88.5 259. 232. 146. 109. 1948 61.1
## 3 88.2 258. 368. 162. 110. 1949 60.2
## 4 89.5 285. 335. 165 111. 1950 61.2
## 5 96.2 329. 210. 310. 112. 1951 63.2
## 6 98.1 347. 193. 359. 113. 1952 63.6
## 7 99 365. 187 355. 115. 1953 65.0
## 8 100 363. 358. 335 116. 1954 63.8
## 9 101. 397. 290. 305. 117. 1955 66.0
## 10 105. 419. 282. 286. 119. 1956 67.9
## 11 108. 443. 294. 280. 120. 1957 68.2
## 12 111. 445. 468. 264. 122. 1958 66.5
## 13 113. 483. 381. 255. 123. 1959 68.7
## 14 114. 503. 393. 251. 125. 1960 69.6
## 15 116. 518. 481. 257. 128. 1961 69.3
## 16 117. 555. 401. 283. 130. 1962 70.6
# Scatterplot with the fitted regression line overlaid via abline().
cars %>% plot(dist ~ speed, data = ., xlab="Speed", ylab="Stopping Distance")
cars %>% lm(dist ~ speed, data = .) %>% abline(col = "red")
# Map each species to a plotting symbol; col = Species uses the factor codes.
shape_map <- c("setosa" = 1, "versicolor" = 2, "virginica" = 3)
iris %$% plot(Petal.Length ~ Petal.Width, pch = shape_map[Species], col = Species)
iris %$% plot(Sepal.Length ~ Sepal.Width, pch = shape_map[Species], col = Species)
#cars %>% qplot(speed, dist, data = .)
# NOTE(review): qplot() is deprecated as of ggplot2 3.4; the equivalent
# ggplot() + geom_point() calls appear below.
iris %>% qplot(Petal.Width, Petal.Length, color = Species, data = .)
iris %>% qplot(Sepal.Width, Sepal.Length, color = Species, data = .)
cars %>% qplot(speed, data = ., bins = 10)
### Geometries
cars %>% qplot(speed, data = ., geom = "density")
cars %>% qplot (dist, data = ., geom = "density")
#cars %>% qplot(speed, dist, data = .)
# Full ggplot grammar: data, then aesthetics inside geoms.
ggplot(cars) + geom_point(aes(x = speed, y = dist))
#ggplot(cars, aes(x = speed, y = dist)) + geom_point()
#cars %>% ggplot(aes(x = speed, y = dist)) + geom_point()
iris %>% ggplot + geom_point(aes(x = Petal.Width, y = Petal.Length, color = Species))
iris %>% ggplot +
geom_point(aes(x = Sepal.Width, y = Sepal.Length,
color = Species))
# color outside aes() is a fixed setting, not a data mapping.
iris %>% ggplot +
geom_point(aes(x = Petal.Width, y = Petal.Length),
color = "black")
cars %>% ggplot + geom_histogram(aes(x = speed), bins = 10)
cars %>% ggplot + geom_density(aes(x = speed))
cars %>% ggplot + geom_histogram(aes(x=dist), bins = 10)
cars %>% ggplot + geom_density(aes(x=dist))
# after_stat(count) puts histogram and density on the same count scale.
cars %>% ggplot(aes(x = speed, y = after_stat(count))) +
geom_histogram(bins = 10) + geom_density()
cars %>% ggplot(aes(x = dist, y = after_stat(count))) + geom_histogram(bins = 10) + geom_density()
# Smoothers: linear model fit vs. loess fit.
cars %>% ggplot(aes(x = speed, y = dist)) +
geom_point() + geom_smooth(method = "lm", formula = y ~ x)
cars %>% ggplot(aes(x = speed, y = dist)) +
geom_point() + geom_smooth( method = "loess",formula = y ~ x)
# Two series on one plot, first via separate layers, then via a key column.
longley %>% ggplot(aes(x = Year)) + geom_line(aes(y = Unemployed)) + geom_line(aes(y = Armed.Forces), color = "blue")
longley %>% ggplot(aes(x = Year)) + geom_point(aes(y = Unemployed)) + geom_point(aes(y = Armed.Forces), color = "blue") + geom_line(aes(y = Unemployed)) + geom_line(aes(y = Armed.Forces), color = "blue")
longley %>% gather(key, value, Unemployed, Armed.Forces) %>%
ggplot(aes(x = Year, y = value, color = key)) + geom_line() + geom_point()
longley %>% gather(key, value, Unemployed, Armed.Forces) %>%
ggplot(aes(x = Year, y = value)) + geom_line() +
facet_grid(key ~ .) #### Facet row-wise with lines
longley %>% gather(key, value, Unemployed, Armed.Forces) %>%
ggplot(aes(x = Year, y = value)) + geom_point() +
facet_grid(key ~ .) #### Facet row-wise with points
longley %>% gather(key, value, Unemployed, Armed.Forces) %>%
ggplot(aes(x = Year, y = value)) + geom_line() +
facet_grid(. ~ key ) #### Facet column-wise with lines
longley %>% gather(key, value, Unemployed, Armed.Forces) %>%
ggplot(aes(x = Year, y = value)) + geom_point() +
facet_grid(. ~ key) #### Facet column-wise with points
iris %>% gather(Measurement, Value, -Species) %>%
ggplot(aes(x = Species, y = Value)) + geom_boxplot() +
facet_grid(Measurement ~ .) # row-wise
iris %>% gather(Measurement, Value, -Species) %>%
ggplot(aes(x = Species, y = Value)) + geom_boxplot() +
facet_grid(. ~ Measurement ) # Column wise
iris %>% gather(Measurement, Value, -Species) %>%
ggplot(aes(x = Species, y = Value)) +geom_boxplot() +
facet_grid(Measurement ~ ., scale = "free_y")
label_map <- c(Petal.Width = "Petal Width",
Petal.Length = "Petal Length",
Sepal.Width = "Sepal Width",
Sepal.Length = "Sepal Length")
iris %>% gather(Measurement, Value, -Species) %>%
ggplot(aes(x = Species, y = Value)) +
geom_boxplot() + facet_grid(Measurement ~ ., scale = "free_y", labeller = labeller(Measurement = label_map))
### Scaling
cars %>% ggplot(aes(x = speed, y = dist)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x) + scale_x_continuous("Speed") + scale_y_continuous("Stopping Distance")
longley %>% gather(key, value, Unemployed, Armed.Forces) %>%
ggplot(aes(x = Year, y = value)) + geom_line() + geom_point() + scale_x_continuous(breaks = 1947:1962) +
facet_grid(key ~ .)
cars %>% ggplot(aes(x = speed, y = dist)) +
geom_point() + geom_smooth(method = "lm", formula = y ~ x) + scale_x_reverse("Speed") + scale_y_continuous("Stopping Distance")
cars %>% ggplot(aes(x = speed, y = dist)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x) + scale_x_continuous("speed") + scale_y_reverse("Stopping Distance")
iris %>% ggplot(aes(x = Species, y = Petal.Length)) +
geom_boxplot() + geom_jitter(width = 0.1, height = 0.1)
iris %>% ggplot(aes(x = Species, y = Petal.Length)) +
geom_boxplot() + geom_jitter(width = 0.1, height = 0.1) +
scale_x_discrete(labels = c("setosa" = "Setosa",
"versicolor" = "Versicolor", "virginica" = "Virginica"))
iris %>% ggplot(aes(x = Species, y = Sepal.Length)) +
geom_boxplot() + geom_jitter(width = 0.001, height = 0.001) + scale_x_discrete(labels = c("setosa" = "Setosa",
"versicolor" = "Versicolor", "virginica" = "Virginica"))
#??geom_jitter
iris %>% gather(Measurement, Value, -Species) %>%
ggplot(aes(x = Species, y = Value, fill = Species)) +
geom_boxplot() + facet_grid(Measurement ~ ., scale = "free_y", labeller = labeller(Measurement = label_map))
iris %>% gather(Measurement, Value, -Species) %>%
ggplot(aes(x = Species, y = Value, fill = Species)) +
geom_boxplot() + scale_fill_manual(values = c("red", "green", "blue")) + facet_grid(Measurement ~ ., scale = "free_y", labeller = labeller(Measurement = label_map))
iris %>% gather(Measurement, Value, -Species) %>%
ggplot(aes(x = Species, y = Value, fill = Species)) +
geom_boxplot() + scale_fill_brewer(palette = "Greens") +
facet_grid(Measurement ~ ., scale = "free_y", labeller = labeller(Measurement = label_map))
#theme_set(theme_bw(2))
#old <- theme_set(theme_bw())
#theme_set(old)
#theme_replace(panel.grid.minor = element_line(colour = "red"))
iris %>% gather(Measurement, Value, -Species) %>%
ggplot(aes(x = Species, y = Value, fill = Species)) +
geom_boxplot() + scale_x_discrete(labels = c("setosa" = "Setosa", "versicolor" = "Versicolor", "virginica" = "Virginica")) + scale_fill_brewer(palette = "Reds") +
facet_grid(Measurement ~ ., switch = "y", labeller = labeller(Measurement = label_map)) + coord_flip()
iris %>% gather(Measurement, Value, -Species) %>%
ggplot(aes(x = Species, y = Value, fill = Species)) +
geom_boxplot() + scale_x_discrete(labels = c("setosa" = "Setosa", "versicolor" = "Versicolor", "virginica" = "Virginica")) + scale_fill_brewer(palette = "Blues") +
facet_grid(Measurement ~ ., switch = "y", labeller = labeller(Measurement = label_map)) + coord_flip()
iris %>% gather(Measurement, Value, -Species) %>%
ggplot(aes(x = Species, y = Value, fill = Species)) +
geom_boxplot() + scale_x_discrete(labels = c("setosa" = "Setosa", "versicolor" = "Versicolor", "virginica" = "Virginica")) + scale_fill_brewer(palette = "Greens") +
facet_grid(Measurement ~ ., switch = "y", labeller = labeller(Measurement = label_map)) + coord_flip() + theme(strip.background = element_blank()) + theme(legend.position ="top")
label_map <- c(Petal.Width = "Petal Width",Petal.Length = "Petal Length",Sepal.Width = "Sepal Width",Sepal.Length = "Sepal Length")
species_map <- c(setosa = "Setosa",versicolor = "Versicolor",virginica = "Virginica")
iris %>% gather(Measurement, Value, -Species) %>%
ggplot(aes(x = Species, y = Value, fill = Species)) +
geom_boxplot() + scale_x_discrete(labels = species_map) +
scale_fill_brewer(palette = "Greens", labels = species_map) + facet_grid(Measurement ~ ., switch = "y",
labeller = labeller(Measurement = label_map)) +
coord_flip() + theme(strip.background = element_blank()) +
theme(legend.position="bottom")
### Figures with Multiple Plots
petal <- iris %>% ggplot() +
geom_point(aes(x = Petal.Width, y = Petal.Length,
color = Species)) + theme(legend.position = "none")
sepal <- iris %>% ggplot() +
geom_point(aes(x = Sepal.Width, y = Sepal.Length,
color = Species)) + theme(legend.position = "none")
library(gridExtra)
grid.arrange(petal, sepal, nrow = 2, ncol = 1)
grid.arrange(petal, sepal, nrow = 1, ncol= 2)
library(cowplot)
plot_grid(petal, sepal, labels = c("A", "B"))
## **Chapter 5**
### Working With Large Data Sets
iris %>% sample_n(size = 10)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.3 3.7 1.5 0.2 setosa
## 2 4.4 3.0 1.3 0.2 setosa
## 3 6.3 3.3 6.0 2.5 virginica
## 4 4.4 3.2 1.3 0.2 setosa
## 5 6.9 3.1 5.4 2.1 virginica
## 6 5.0 3.5 1.6 0.6 setosa
## 7 6.0 2.9 4.5 1.5 versicolor
## 8 5.5 2.3 4.0 1.3 versicolor
## 9 5.1 3.8 1.5 0.3 setosa
## 10 6.5 2.8 4.6 1.5 versicolor
iris %>% sample_frac(size = 0.02)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 4.4 3.2 1.3 0.2 setosa
## 2 5.6 2.7 4.2 1.3 versicolor
## 3 5.7 2.5 5.0 2.0 virginica
library(pryr)
mem_change(x <- rnorm(10000))
## 81.3 kB
mem_change(x[1] <- 0)
## 736 B
mem_change(y <- x)
## -120 B
mem_change(x[1] <- 0)
## 80.8 kB
d <- data.frame(x = rnorm(10000), y = rnorm(10000))
d %>% ggplot(aes(x = x, y = y)) + geom_point(alpha = 0.2)
mem_change(d)
## 528 B
d %>% ggplot(aes(x = x, y = y)) + geom_density_2d()
mem_change(d)
## -720 B
#install.packages("hexbin")
d %>% ggplot(aes(x = x, y = y)) + geom_hex() + scale_fill_gradient(low = "lightgray", high = "red") + geom_density2d(color = "black")
d %>% ggplot(aes(x = x, y = y)) + geom_hex()
library(ff)
ffcars <- as.ffdf(cars)
summary(ffcars)
## Length Class Mode
## speed 50 ff_vector list
## dist 50 ff_vector list
ffiris <- as.ffdf(iris)
summary(ffiris)
## Length Class Mode
## Sepal.Length 150 ff_vector list
## Sepal.Width 150 ff_vector list
## Petal.Length 150 ff_vector list
## Petal.Width 150 ff_vector list
## Species 150 ff_vector list
library(biglm)
model <- biglm(dist ~ speed, data = ffcars)
summary(model)
## Large data regression model: biglm(dist ~ speed, data = ffcars)
## Sample size = 50
## Coef (95% CI) SE p
## (Intercept) -17.5791 -31.0960 -4.0622 6.7584 0.0093
## speed 3.9324 3.1014 4.7634 0.4155 0.0000
iris_db <- src_sqlite("iris_db.sqlite3", create = TRUE)
iris_sqlite <- tbl(iris_db, "iris", temporary = FALSE)
iris_sqlite %>% group_by(Species) %>%
summarise(mean.Petal.Length = mean(Petal.Length))
## # Source: SQL [?? x 2]
## # Database: sqlite 3.47.1 [C:\Users\HP\Desktop\Data Science\Charles_Nana\iris_db.sqlite3]
## Species mean.Petal.Length
## <chr> <dbl>
## 1 setosa 1.46
## 2 versicolor 4.26
## 3 virginica 5.55
## **Chapter 6**
## **Supervised Machine Learning**
### Linear Regression
cars %>% head()
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
cars %>% ggplot(aes(x = speed, y = dist)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x)
cars %>% lm(dist ~ speed, data = .) %>% summary
##
## Call:
## lm(formula = dist ~ speed, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.069 -9.525 -2.272 9.215 43.201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791 6.7584 -2.601 0.0123 *
## speed 3.9324 0.4155 9.464 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
## F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
cars %>% lm(dist ~ speed, data = .) %>% coefficients
## (Intercept) speed
## -17.579095 3.932409
cars %>% lm(dist ~ speed, data = .) %>% confint
## 2.5 % 97.5 %
## (Intercept) -31.167850 -3.990340
## speed 3.096964 4.767853
# Build a data frame of predicted stopping distances dist = theta_1 * speed,
# tagging each row with theta_1 as a factor so ggplot can color the candidate
# regression lines separately.
predict_dist <- function(speed, theta_1) {
  data.frame(
    speed = speed,
    dist = theta_1 * speed,
    theta = as.factor(theta_1)
  )
}
cars %>% ggplot(aes(x = speed, y = dist, colour = theta)) +
geom_point(colour = "black") +
geom_line(data = predict_dist(cars$speed, 2)) +
geom_line(data = predict_dist(cars$speed, 3)) +
geom_line(data = predict_dist(cars$speed, 4)) +
scale_color_discrete(name = expression(theta[1]))
thetas <- seq(0, 5, length.out = 50)
# Sum-of-squared-errors of the no-intercept model dist = theta * speed on the
# built-in cars data, vectorized over theta so it can be mapped over a grid of
# candidate slopes for plotting the error curve.
fitting_error <- Vectorize(function(theta) {
  residuals <- cars$dist - theta * cars$speed
  sum(residuals^2)
})
data.frame(thetas = thetas, errors = fitting_error(thetas)) %>%
ggplot(aes(x = thetas, y = errors)) +
geom_line() +
xlab(expression(theta[1])) + ylab("")
cars %>% lm(dist ~ speed - 1, data = .) %>% coefficients
## speed
## 2.909132
cars %>% ggplot(aes(x = speed, y = dist)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x - 1)
### Logistic Regression
data("BreastCancer")
BreastCancer %>% head
## Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size
## 1 1000025 5 1 1 1 2
## 2 1002945 5 4 4 5 7
## 3 1015425 3 1 1 1 2
## 4 1016277 6 8 8 1 3
## 5 1017023 4 1 1 3 2
## 6 1017122 8 10 10 8 7
## Bare.nuclei Bl.cromatin Normal.nucleoli Mitoses Class
## 1 1 3 1 1 benign
## 2 10 3 2 1 benign
## 3 2 3 1 1 benign
## 4 4 3 7 1 benign
## 5 1 3 1 1 benign
## 6 10 9 7 1 malignant
BreastCancer %>% ggplot(aes(x = Cl.thickness, y = Class)) +
geom_jitter(height = 0.05, width = 0.3, alpha = 0.4)
#data("BreastCancer")
#BreastCancer %>% sample_frac(0.4) %>% head
BreastCancer %>% ggplot(aes(x = Cell.size, y = Class)) +
geom_jitter(height = 0.05, width = .3, alpha = 0.4)
BreastCancer %>%
mutate(Cl.thickness.numeric =
as.numeric(as.character(Cl.thickness))) %>%
mutate(IsMalignant = ifelse(Class == "benign", 0, 1)) %>%
ggplot(aes(x = Cl.thickness.numeric, y = IsMalignant)) + geom_jitter(height = 0.05, width = 0.3, alpha=0.4) +
geom_smooth(method = "glm", formula = y ~ x,
method.args = list(family = "binomial"))
BreastCancer %>% mutate(Cl.thickness.numeric =
as.numeric(as.character(Cl.thickness))) %>%
mutate(IsMalignant = ifelse(Class == "benign", 0, 1)) %>%
glm(IsMalignant ~ Cl.thickness.numeric,
family = "binomial", data = .)
##
## Call: glm(formula = IsMalignant ~ Cl.thickness.numeric, family = "binomial",
## data = .)
##
## Coefficients:
## (Intercept) Cl.thickness.numeric
## -5.1602 0.9355
##
## Degrees of Freedom: 698 Total (i.e. Null); 697 Residual
## Null Deviance: 900.5
## Residual Deviance: 464.1 AIC: 468.1
### Model Matrices and Formulae
cars %>% model.matrix(dist ~ speed, data = .) %>% head(5)
## (Intercept) speed
## 1 1 4
## 2 1 4
## 3 1 7
## 4 1 7
## 5 1 8
cars %>% model.matrix(dist ~ speed - 1, data = .) %>% head(5)
## speed
## 1 4
## 2 4
## 3 7
## 4 7
## 5 8
BreastCancer %>%
mutate(Cl.thickness.numeric =
as.numeric(as.character(Cl.thickness)),
Cell.size.numeric =
as.numeric(as.character(Cell.size))) %>%
mutate(IsMalignant = ifelse(Class == "benign", 0, 1)) %>%
model.matrix(IsMalignant ~ Cl.thickness.numeric + Cell.size.numeric,
data = .) %>% head()
## (Intercept) Cl.thickness.numeric Cell.size.numeric
## 1 1 5 1
## 2 1 5 4
## 3 1 3 1
## 4 1 6 8
## 5 1 4 1
## 6 1 8 10
BreastCancer %>%
mutate(Cl.thickness.numeric =
as.numeric(as.character(Cl.thickness)),
Cell.size.numeric =
as.numeric(as.character(Cell.size))) %>%
mutate(IsMalignant = ifelse(Class == "benign", 0, 1)) %>%
model.matrix(IsMalignant ~ Cl.thickness.numeric + Cell.size.numeric - 1,
data = .) %>% head()
## Cl.thickness.numeric Cell.size.numeric
## 1 5 1
## 2 5 4
## 3 3 1
## 4 6 8
## 5 4 1
## 6 8 10
BreastCancer %>%
mutate(Cl.thickness.numeric =
as.numeric(as.character(Cl.thickness)),
Cell.size.numeric =
as.numeric(as.character(Cell.size))) %>%
mutate(IsMalignant = ifelse(Class == "benign", 0, 1)) %>%
glm(IsMalignant ~ Cl.thickness.numeric + Cell.size.numeric - 1, family = "binomial", data = .)
##
## Call: glm(formula = IsMalignant ~ Cl.thickness.numeric + Cell.size.numeric -
## 1, family = "binomial", data = .)
##
## Coefficients:
## Cl.thickness.numeric Cell.size.numeric
## -0.3182 0.5624
##
## Degrees of Freedom: 699 Total (i.e. Null); 697 Residual
## Null Deviance: 969
## Residual Deviance: 798.8 AIC: 802.8
BreastCancer %>%
mutate(Cl.thickness.numeric =
as.numeric(as.character(Cl.thickness)),
Cell.size.numeric =
as.numeric(as.character(Cell.size))) %>%
mutate(IsMalignant = ifelse(Class == "benign", 0, 1)) %>%
glm(IsMalignant ~ Cl.thickness.numeric + Cell.size.numeric,
family = "binomial", data = .)
##
## Call: glm(formula = IsMalignant ~ Cl.thickness.numeric + Cell.size.numeric,
## family = "binomial", data = .)
##
## Coefficients:
## (Intercept) Cl.thickness.numeric Cell.size.numeric
## -7.1517 0.6174 1.1751
##
## Degrees of Freedom: 698 Total (i.e. Null); 696 Residual
## Null Deviance: 900.5
## Residual Deviance: 212.3 AIC: 218.3
BreastCancer %>%
mutate(IsMalignant = ifelse(Class == "benign", 0, 1)) %>%
model.matrix(IsMalignant ~ Bare.nuclei, data = .) %>%
head()
## (Intercept) Bare.nuclei2 Bare.nuclei3 Bare.nuclei4 Bare.nuclei5 Bare.nuclei6
## 1 1 0 0 0 0 0
## 2 1 0 0 0 0 0
## 3 1 1 0 0 0 0
## 4 1 0 0 1 0 0
## 5 1 0 0 0 0 0
## 6 1 0 0 0 0 0
## Bare.nuclei7 Bare.nuclei8 Bare.nuclei9 Bare.nuclei10
## 1 0 0 0 0
## 2 0 0 0 1
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 1
BreastCancer %>%
mutate(IsMalignant = ifelse(Class == "benign", 0, 1)) %>%
model.matrix(IsMalignant ~ Cl.thickness, data = .) %>%
head()
## (Intercept) Cl.thickness.L Cl.thickness.Q Cl.thickness.C Cl.thickness^4
## 1 1 -0.05504819 -0.34815531 0.1295501 0.33658092
## 2 1 -0.05504819 -0.34815531 0.1295501 0.33658092
## 3 1 -0.27524094 -0.08703883 0.3778543 -0.31788198
## 4 1 0.05504819 -0.34815531 -0.1295501 0.33658092
## 5 1 -0.16514456 -0.26111648 0.3346710 0.05609682
## 6 1 0.27524094 -0.08703883 -0.3778543 -0.31788198
## Cl.thickness^5 Cl.thickness^6 Cl.thickness^7 Cl.thickness^8 Cl.thickness^9
## 1 -0.21483446 -0.3113996 0.3278724 0.2617852 -0.5714300
## 2 -0.21483446 -0.3113996 0.3278724 0.2617852 -0.5714300
## 3 -0.03580574 0.3892495 -0.5035184 0.3739788 -0.1632657
## 4 0.21483446 -0.3113996 -0.3278724 0.2617852 0.5714300
## 5 -0.39386318 0.2335497 0.2459043 -0.5235703 0.3809534
## 6 0.03580574 0.3892495 0.5035184 0.3739788 0.1632657
BreastCancer %>%
mutate(Cl.thickness.numeric =
as.numeric(as.character(Cl.thickness)),Cell.size.numeric =
as.numeric(as.character(Cell.size))) %>%
mutate(IsMalignant = ifelse(Class == "benign", 0, 1)) %>%
model.matrix(IsMalignant ~ Cl.thickness.numeric * Cell.size.numeric, data = .) %>% head()
## (Intercept) Cl.thickness.numeric Cell.size.numeric
## 1 1 5 1
## 2 1 5 4
## 3 1 3 1
## 4 1 6 8
## 5 1 4 1
## 6 1 8 10
## Cl.thickness.numeric:Cell.size.numeric
## 1 5
## 2 20
## 3 3
## 4 48
## 5 4
## 6 80
BreastCancer %>%
mutate(Cl.thickness.numeric =
as.numeric(as.character(Cl.thickness))) %>%
mutate(IsMalignant = ifelse(Class == "benign", 0, 1)) %>%
model.matrix(IsMalignant ~ Cl.thickness.numeric * Bare.nuclei, data = .) %>% head()
## (Intercept) Cl.thickness.numeric Bare.nuclei2 Bare.nuclei3 Bare.nuclei4
## 1 1 5 0 0 0
## 2 1 5 0 0 0
## 3 1 3 1 0 0
## 4 1 6 0 0 1
## 5 1 4 0 0 0
## 6 1 8 0 0 0
## Bare.nuclei5 Bare.nuclei6 Bare.nuclei7 Bare.nuclei8 Bare.nuclei9
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## Bare.nuclei10 Cl.thickness.numeric:Bare.nuclei2
## 1 0 0
## 2 1 0
## 3 0 3
## 4 0 0
## 5 0 0
## 6 1 0
## Cl.thickness.numeric:Bare.nuclei3 Cl.thickness.numeric:Bare.nuclei4
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 6
## 5 0 0
## 6 0 0
## Cl.thickness.numeric:Bare.nuclei5 Cl.thickness.numeric:Bare.nuclei6
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Cl.thickness.numeric:Bare.nuclei7 Cl.thickness.numeric:Bare.nuclei8
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Cl.thickness.numeric:Bare.nuclei9 Cl.thickness.numeric:Bare.nuclei10
## 1 0 0
## 2 0 5
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 8
BreastCancer %>%
mutate(Cl.thickness.numeric =
as.numeric(as.character(Cl.thickness))) %>%
mutate(IsMalignant = ifelse(Class == "benign", 0, 1)) %>%
model.matrix(IsMalignant ~ Cl.thickness.numeric : Bare.nuclei, data = .) %>% head()
## (Intercept) Cl.thickness.numeric:Bare.nuclei1
## 1 1 5
## 2 1 0
## 3 1 0
## 4 1 0
## 5 1 4
## 6 1 0
## Cl.thickness.numeric:Bare.nuclei2 Cl.thickness.numeric:Bare.nuclei3
## 1 0 0
## 2 0 0
## 3 3 0
## 4 0 0
## 5 0 0
## 6 0 0
## Cl.thickness.numeric:Bare.nuclei4 Cl.thickness.numeric:Bare.nuclei5
## 1 0 0
## 2 0 0
## 3 0 0
## 4 6 0
## 5 0 0
## 6 0 0
## Cl.thickness.numeric:Bare.nuclei6 Cl.thickness.numeric:Bare.nuclei7
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Cl.thickness.numeric:Bare.nuclei8 Cl.thickness.numeric:Bare.nuclei9
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Cl.thickness.numeric:Bare.nuclei10
## 1 0
## 2 5
## 3 0
## 4 0
## 5 0
## 6 8
cars %>% model.matrix(dist ~ speed + speed^2, data = .) %>% head
## (Intercept) speed
## 1 1 4
## 2 1 4
## 3 1 7
## 4 1 7
## 5 1 8
## 6 1 9
### Polynomial function
cars %>% model.matrix(dist ~ speed + I(speed^2), data = .) %>% head
## (Intercept) speed I(speed^2)
## 1 1 4 16
## 2 1 4 16
## 3 1 7 49
## 4 1 7 49
## 5 1 8 64
## 6 1 9 81
cars %>% lm(dist ~ speed + I(speed^2), data = .) %>%
summary
##
## Call:
## lm(formula = dist ~ speed + I(speed^2), data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.720 -9.184 -3.188 4.628 45.152
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.47014 14.81716 0.167 0.868
## speed 0.91329 2.03422 0.449 0.656
## I(speed^2) 0.09996 0.06597 1.515 0.136
##
## Residual standard error: 15.18 on 47 degrees of freedom
## Multiple R-squared: 0.6673, Adjusted R-squared: 0.6532
## F-statistic: 47.14 on 2 and 47 DF, p-value: 5.852e-12
cars %>% ggplot(aes(x = speed, y = dist)) + geom_point() +
geom_smooth(method = "lm", formula = y ~ x + I(x^2))
iris %>% model.matrix(Sepal.Width ~ Sepal.Length, data = .) %>% head
## (Intercept) Sepal.Length
## 1 1 5.1
## 2 1 4.9
## 3 1 4.7
## 4 1 4.6
## 5 1 5.0
## 6 1 5.4
iris %>% model.matrix(Sepal.Width ~ Sepal.Length + I(Sepal.Length^2), data=.) %>% head()
## (Intercept) Sepal.Length I(Sepal.Length^2)
## 1 1 5.1 26.01
## 2 1 4.9 24.01
## 3 1 4.7 22.09
## 4 1 4.6 21.16
## 5 1 5.0 25.00
## 6 1 5.4 29.16
iris %>% lm(Sepal.Width ~ Sepal.Length + I(Sepal.Length^2), data = .) %>% summary
##
## Call:
## lm(formula = Sepal.Width ~ Sepal.Length + I(Sepal.Length^2),
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.13070 -0.26310 -0.02446 0.25728 1.38725
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.41584 1.58499 4.048 8.33e-05 ***
## Sepal.Length -1.08556 0.53625 -2.024 0.0447 *
## I(Sepal.Length^2) 0.08571 0.04476 1.915 0.0574 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4304 on 147 degrees of freedom
## Multiple R-squared: 0.03783, Adjusted R-squared: 0.02474
## F-statistic: 2.89 on 2 and 147 DF, p-value: 0.05877
iris %>% ggplot(aes(x = Sepal.Width, y = Sepal.Length)) + geom_point( color = "lightgreen") +
geom_smooth(method = "lm", formula = y ~ x + I(x^2))
### Validating Models
line <- cars %>% lm(dist ~ speed, data = .)
poly <- cars %>% lm(dist ~ speed + I(speed^2), data = .)
predict(line, cars) %>% head
## 1 2 3 4 5 6
## -1.849460 -1.849460 9.947766 9.947766 13.880175 17.812584
predict(poly, cars) %>% head
## 1 2 3 4 5 6
## 7.722637 7.722637 13.761157 13.761157 16.173834 18.786430
### Evaluating Regression Models
# Root-mean-square error between predictions x and targets t.
# BUG FIX: the original computed sqrt(mean(sum((t - x)^2))) -- mean() of the
# scalar sum() is a no-op, so it returned the root *sum* of squares, inflating
# the error by a factor of sqrt(length(t)). (The printed `##` values further
# down in this transcript were produced with the old formula.)
rmse <- function(x, t) sqrt(mean((t - x)^2))
rmse(predict(line, cars), cars$dist)
## [1] 106.5529
rmse(predict(poly, cars), cars$dist)
## [1] 104.0419
training_data <- cars[1:25,]
test_data <- cars[26:50,]
line <- training_data %>% lm(dist ~ speed, data = .)
poly <- training_data %>% lm(dist ~ speed + I(speed^2), data = .)
rmse(predict(line, test_data), test_data$dist)
## [1] 88.89189
rmse(predict(poly, test_data), test_data$dist)
## [1] 83.84263
sampled_cars <- cars %>% mutate(training = sample(0:1, nrow(cars), replace = TRUE))
sampled_cars %>% head
## speed dist training
## 1 4 2 0
## 2 4 10 0
## 3 7 4 1
## 4 7 22 0
## 5 8 16 0
## 6 9 10 1
training_data <- sampled_cars %>% filter(training == 1)
test_data <- sampled_cars %>% filter(training == 0)
training_data %>% head
## speed dist training
## 1 7 4 1
## 2 9 10 1
## 3 10 26 1
## 4 11 17 1
## 5 11 28 1
## 6 12 20 1
test_data %>% head
## speed dist training
## 1 4 2 0
## 2 4 10 0
## 3 7 22 0
## 4 8 16 0
## 5 10 18 0
## 6 10 34 0
line <- training_data %>% lm(dist ~ speed, data = .)
poly <- training_data %>% lm(dist ~ speed + I(speed^2), data = .)
rmse(predict(line, test_data), test_data$dist)
## [1] 93.54925
rmse(predict(poly, test_data), test_data$dist)
## [1] 87.80469
### Evaluating Classification Models
# Convert the factor-coded predictors to numeric and encode the target as
# 0 (benign) / 1 (malignant) for regression.
formatted_data <- BreastCancer %>%
mutate(Cl.thickness.numeric =
as.numeric(as.character(Cl.thickness)),
Cell.size.numeric =
as.numeric(as.character(Cell.size))) %>%
mutate(IsMalignant = ifelse(Class == "benign", 0, 1))
# NOTE(review): family = "binomial" is missing here, so glm() falls back to
# its gaussian default -- this fits ordinary linear regression, NOT logistic
# regression as the surrounding section intends. The symptom is visible in the
# predictions below: observation 6 gets "probability" 1.1926 > 1.
fitted_model <- formatted_data %>%
glm(IsMalignant ~ Cl.thickness.numeric + Cell.size.numeric, data = .)
# With a gaussian fit, type = "response" is just the linear predictor, hence
# the out-of-range value in column 6.
predict(fitted_model, formatted_data, type = "response") %>% head
## 1 2 3 4 5 6
## 0.17365879 0.45878552 0.06461445 0.89347668 0.11913662 1.19260550
### Confusion Matrix
# Map probabilities to hard 0/1 labels using a 0.5 threshold (0.5 maps to 1).
classify <- function(probability) (probability >= 0.5) + 0
classified_malignant <- classify(predict(fitted_model, formatted_data))
table(formatted_data$IsMalignant, classified_malignant)
## classified_malignant
## 0 1
## 0 450 8
## 1 42 199
table(formatted_data$IsMalignant, classified_malignant,
dnn = c("Data", "Predictions"))
## Predictions
## Data 0 1
## 0 450 8
## 1 42 199
# Map probabilities to class labels: values below 0.5 become "benign",
# everything else "malignant" (same threshold as the numeric classifier above).
classify <- function(probability) {
  ifelse(probability >= 0.5, "malignant", "benign")
}
classified <- classify(predict(fitted_model, formatted_data))
table(formatted_data$Class, classified, dnn = c("Data", "Predictions"))
## Predictions
## Data benign malignant
## benign 450 8
## malignant 42 199
### Accuracy
confusion_matrix <- table(formatted_data$Class, classified,dnn = c("Data", "Predictions"))
(accuracy <- sum(diag(confusion_matrix))/sum(confusion_matrix))
## [1] 0.9284692
#table(BreastCancer$Class)
tbl <- table(BreastCancer$Class)
tbl["benign"] / sum(tbl)
## benign
## 0.6552217
mbl = table (BreastCancer$Class)
mbl["malignant"] / sum(mbl)
## malignant
## 0.3447783
table(BreastCancer$Class, sample(BreastCancer$Class)) # random sampling
##
## benign malignant
## benign 296 162
## malignant 162 79
#replicate(8, acc(table(BreastCancer$Class,sample(BreastCancer$Class))))
#(accuracy(table(BreastCancer$Class,sample(BreastCancer$Class))))
### Sensitivity and Specificity
(specificity <- confusion_matrix[1,1]/
(confusion_matrix[1,1] + confusion_matrix[1,2])) # negative class
## [1] 0.9825328
(sensitivity <- confusion_matrix[2,2]/
(confusion_matrix[2,1] + confusion_matrix[2,2])) # positive class
## [1] 0.8257261
# Specificity: true negatives / all actual negatives. Assumes a 2x2 confusion
# matrix with actual classes on rows, predictions on columns, negative class
# first (the layout produced by table(Data, Predictions) above).
specificity <- function(confusion_matrix)
  confusion_matrix[1, 1] / (confusion_matrix[1, 1] + confusion_matrix[1, 2])
# Sensitivity: true positives / all actual positives (second row).
sensitivity <- function(confusion_matrix)
  confusion_matrix[2, 2] / (confusion_matrix[2, 1] + confusion_matrix[2, 2])
# Named vector of accuracy/specificity/sensitivity for a confusion matrix.
# BUG FIX: the original called accuracy(confusion_matrix), but `accuracy` is a
# plain numeric variable elsewhere in this script (not a function), so any
# call to prediction_summary() raised "could not find function" -- which is
# presumably why the replicate() call below is commented out. The accuracy is
# now computed inline from the matrix diagonal.
prediction_summary <- function(confusion_matrix)
  c("accuracy" = sum(diag(confusion_matrix)) / sum(confusion_matrix),
    "specificity" = specificity(confusion_matrix),
    "sensitivity" = sensitivity(confusion_matrix))
# Baseline: metrics for a random permutation of the BreastCancer labels,
# i.e. what the numbers look like when there is no real signal.
random_prediction_summary <- function()
  prediction_summary(table(BreastCancer$Class,
                           sample(BreastCancer$Class)))
#replicate(3, random_prediction_summary())
### Other Measures
confusion_matrix[2,1] / sum(confusion_matrix[,1])
## [1] 0.08536585
confusion_matrix[1,1] / sum(confusion_matrix[,1])
## [1] 0.9146341
confusion_matrix[2,2] / sum(confusion_matrix[,2])
## [1] 0.9613527
confusion_matrix[1,2] / sum(confusion_matrix[,2])
## [1] 0.03864734
### Random Permutation of Data
permuted_cars <- cars[sample(1:nrow(cars)),]
permuted_cars %>% head()
## speed dist
## 39 20 32
## 8 10 26
## 30 17 40
## 6 9 10
## 18 13 34
## 5 8 16
# Return df with its rows in a uniformly random order.
# FIX: uses seq_len() instead of 1:nrow(df), so a zero-row data frame yields
# an empty permutation rather than the bogus c(1, 0) index sequence.
permute_rows <- function(df) df[sample(seq_len(nrow(df))), ]
permuted_cars <- cars %>% permute_rows
# Split df into n consecutive groups of (nearly) equal size.
# FIX: the original used rep(1:n, each = nrow(df)/n), which silently produced
# a wrong-length grouping vector whenever nrow(df) is not divisible by n
# (rep() truncates the non-integer `each`). sort(rep_len(...)) is identical
# in the divisible case and otherwise yields n groups whose sizes differ by
# at most one.
group_data <- function(df, n) {
  groups <- sort(rep_len(seq_len(n), nrow(df)))
  split(df, groups)
}
cars %>% permute_rows %>% group_data(5) %>% head(1)
## $`1`
## speed dist
## 9 10 34
## 49 24 120
## 7 10 18
## 2 4 10
## 25 15 26
## 4 7 22
## 20 14 26
## 14 12 24
## 5 8 16
## 11 11 28
grouped_cars <- cars %>% permute_rows %>% group_data(5)
grouped_cars[[1]]
## speed dist
## 33 18 56
## 40 20 48
## 6 9 10
## 43 20 64
## 34 18 76
## 37 19 46
## 3 7 4
## 25 15 26
## 13 12 20
## 18 13 34
grouped_cars[1]
## $`1`
## speed dist
## 33 18 56
## 40 20 48
## 6 9 10
## 43 20 64
## 34 18 76
## 37 19 46
## 3 7 4
## 25 15 26
## 13 12 20
## 18 13 34
(grouped_cars[[1]] %>%
lm(dist ~ speed, data = .) %>%.$coefficients)
## (Intercept) speed
## -29.20874 4.47740
qplot(grouped_cars[[1]] %>%lm(dist ~ speed, data = .) %>%.$coefficients)
estimates <- grouped_cars[[1]] %>%
lm(dist ~ speed, data = .) %>%.$coefficients
for (i in 2:length(grouped_cars)) {
group_estimates <- grouped_cars[[i]] %>%
lm(dist ~ speed, data = .) %>%
.$coefficients
estimates <- rbind(estimates, group_estimates)
}
estimates
## (Intercept) speed
## estimates -29.208735 4.477400
## group_estimates -17.539144 4.614327
## group_estimates 16.330709 2.244094
## group_estimates -9.499446 2.892580
## group_estimates -20.833333 3.989035
library(purrr)
estimates <- grouped_cars %>%
map(. %>% lm(dist ~ speed, data = .) %>% .$coefficients)
estimates
## $`1`
## (Intercept) speed
## -29.20874 4.47740
##
## $`2`
## (Intercept) speed
## -17.539144 4.614327
##
## $`3`
## (Intercept) speed
## 16.330709 2.244094
##
## $`4`
## (Intercept) speed
## -9.499446 2.892580
##
## $`5`
## (Intercept) speed
## -20.833333 3.989035
estimates <- grouped_cars %>%
map(. %>% lm(dist ~ speed, data = .) %>% .$coefficients) %>% do.call("rbind", .)
estimates
## (Intercept) speed
## 1 -29.208735 4.477400
## 2 -17.539144 4.614327
## 3 16.330709 2.244094
## 4 -9.499446 2.892580
## 5 -20.833333 3.989035
### Cross Validation
# For each fold i, bind every group except the i-th into one data frame;
# returns a list with one leave-one-group-out data set per fold.
cross_validation_groups <- function(grouped_df) {
  lapply(seq_along(grouped_df), function(fold) {
    do.call(rbind, grouped_df[-fold])
  })
}
cars %>% permute_rows %>% group_data(5) %>% cross_validation_groups %>% map(. %>% lm(dist ~ speed, data = .) %>% .$coefficients) %>% do.call("rbind", .)
## (Intercept) speed
## [1,] -19.21612 4.209059
## [2,] -25.09475 4.375456
## [3,] -16.60570 3.894515
## [4,] -17.04083 3.752234
## [5,] -11.50681 3.509767
# For each fold i, return a list(training = all groups but i combined,
# test = group i); the result has one such training/test pair per fold.
cross_validation_split <- function(grouped_df) {
  lapply(seq_along(grouped_df), function(fold) {
    list(
      training = do.call(rbind, grouped_df[-fold]),
      test = grouped_df[[fold]]
    )
  })
}
cars %>%
permute_rows %>%
group_data(5) %>%
cross_validation_split
## [[1]]
## [[1]]$training
## speed dist
## 2.31 17 50
## 2.8 10 26
## 2.37 19 46
## 2.5 8 16
## 2.23 14 80
## 2.39 20 32
## 2.50 25 85
## 2.34 18 76
## 2.15 12 28
## 2.22 14 60
## 3.11 11 28
## 3.44 22 66
## 3.21 14 36
## 3.33 18 56
## 3.40 20 48
## 3.2 4 10
## 3.12 12 14
## 3.14 12 24
## 3.47 24 92
## 3.32 18 42
## 4.4 7 22
## 4.20 14 26
## 4.19 13 46
## 4.38 19 68
## 4.18 13 34
## 4.46 24 70
## 4.6 9 10
## 4.13 12 20
## 4.42 20 56
## 4.1 4 2
## 5.43 20 64
## 5.7 10 18
## 5.10 11 17
## 5.30 17 40
## 5.17 13 34
## 5.35 18 84
## 5.48 24 93
## 5.24 15 20
## 5.16 13 26
## 5.45 23 54
##
## [[1]]$test
## speed dist
## 25 15 26
## 27 16 32
## 26 15 54
## 49 24 120
## 3 7 4
## 41 20 52
## 29 17 32
## 28 16 40
## 9 10 34
## 36 19 36
##
##
## [[2]]
## [[2]]$training
## speed dist
## 1.25 15 26
## 1.27 16 32
## 1.26 15 54
## 1.49 24 120
## 1.3 7 4
## 1.41 20 52
## 1.29 17 32
## 1.28 16 40
## 1.9 10 34
## 1.36 19 36
## 3.11 11 28
## 3.44 22 66
## 3.21 14 36
## 3.33 18 56
## 3.40 20 48
## 3.2 4 10
## 3.12 12 14
## 3.14 12 24
## 3.47 24 92
## 3.32 18 42
## 4.4 7 22
## 4.20 14 26
## 4.19 13 46
## 4.38 19 68
## 4.18 13 34
## 4.46 24 70
## 4.6 9 10
## 4.13 12 20
## 4.42 20 56
## 4.1 4 2
## 5.43 20 64
## 5.7 10 18
## 5.10 11 17
## 5.30 17 40
## 5.17 13 34
## 5.35 18 84
## 5.48 24 93
## 5.24 15 20
## 5.16 13 26
## 5.45 23 54
##
## [[2]]$test
## speed dist
## 31 17 50
## 8 10 26
## 37 19 46
## 5 8 16
## 23 14 80
## 39 20 32
## 50 25 85
## 34 18 76
## 15 12 28
## 22 14 60
##
##
## [[3]]
## [[3]]$training
## speed dist
## 1.25 15 26
## 1.27 16 32
## 1.26 15 54
## 1.49 24 120
## 1.3 7 4
## 1.41 20 52
## 1.29 17 32
## 1.28 16 40
## 1.9 10 34
## 1.36 19 36
## 2.31 17 50
## 2.8 10 26
## 2.37 19 46
## 2.5 8 16
## 2.23 14 80
## 2.39 20 32
## 2.50 25 85
## 2.34 18 76
## 2.15 12 28
## 2.22 14 60
## 4.4 7 22
## 4.20 14 26
## 4.19 13 46
## 4.38 19 68
## 4.18 13 34
## 4.46 24 70
## 4.6 9 10
## 4.13 12 20
## 4.42 20 56
## 4.1 4 2
## 5.43 20 64
## 5.7 10 18
## 5.10 11 17
## 5.30 17 40
## 5.17 13 34
## 5.35 18 84
## 5.48 24 93
## 5.24 15 20
## 5.16 13 26
## 5.45 23 54
##
## [[3]]$test
## speed dist
## 11 11 28
## 44 22 66
## 21 14 36
## 33 18 56
## 40 20 48
## 2 4 10
## 12 12 14
## 14 12 24
## 47 24 92
## 32 18 42
##
##
## [[4]]
## [[4]]$training
## speed dist
## 1.25 15 26
## 1.27 16 32
## 1.26 15 54
## 1.49 24 120
## 1.3 7 4
## 1.41 20 52
## 1.29 17 32
## 1.28 16 40
## 1.9 10 34
## 1.36 19 36
## 2.31 17 50
## 2.8 10 26
## 2.37 19 46
## 2.5 8 16
## 2.23 14 80
## 2.39 20 32
## 2.50 25 85
## 2.34 18 76
## 2.15 12 28
## 2.22 14 60
## 3.11 11 28
## 3.44 22 66
## 3.21 14 36
## 3.33 18 56
## 3.40 20 48
## 3.2 4 10
## 3.12 12 14
## 3.14 12 24
## 3.47 24 92
## 3.32 18 42
## 5.43 20 64
## 5.7 10 18
## 5.10 11 17
## 5.30 17 40
## 5.17 13 34
## 5.35 18 84
## 5.48 24 93
## 5.24 15 20
## 5.16 13 26
## 5.45 23 54
##
## [[4]]$test
## speed dist
## 4 7 22
## 20 14 26
## 19 13 46
## 38 19 68
## 18 13 34
## 46 24 70
## 6 9 10
## 13 12 20
## 42 20 56
## 1 4 2
##
##
## [[5]]
## [[5]]$training
## speed dist
## 1.25 15 26
## 1.27 16 32
## 1.26 15 54
## 1.49 24 120
## 1.3 7 4
## 1.41 20 52
## 1.29 17 32
## 1.28 16 40
## 1.9 10 34
## 1.36 19 36
## 2.31 17 50
## 2.8 10 26
## 2.37 19 46
## 2.5 8 16
## 2.23 14 80
## 2.39 20 32
## 2.50 25 85
## 2.34 18 76
## 2.15 12 28
## 2.22 14 60
## 3.11 11 28
## 3.44 22 66
## 3.21 14 36
## 3.33 18 56
## 3.40 20 48
## 3.2 4 10
## 3.12 12 14
## 3.14 12 24
## 3.47 24 92
## 3.32 18 42
## 4.4 7 22
## 4.20 14 26
## 4.19 13 46
## 4.38 19 68
## 4.18 13 34
## 4.46 24 70
## 4.6 9 10
## 4.13 12 20
## 4.42 20 56
## 4.1 4 2
##
## [[5]]$test
## speed dist
## 43 20 64
## 7 10 18
## 10 11 17
## 30 17 40
## 17 13 34
## 35 18 84
## 48 24 93
## 24 15 20
## 16 13 26
## 45 23 54
# For each training/test split, fit dist ~ speed on the training rows and
# return the rmse() of the model's predictions on the held-out test rows.
# BUG FIX: the original piped the test data into predict()
# (`test %>% predict(model, data = .)`), which makes the data frame the first
# argument of predict() -- the slot reserved for the model object. The correct
# call is predict(model, newdata = test).
prediction_accuracy_cars <- function(test_and_training) {
  result <- vector(mode = "numeric",
                   length = length(test_and_training))
  for (i in seq_along(test_and_training)) {
    training <- test_and_training[[i]]$training
    test <- test_and_training[[i]]$test
    model <- lm(dist ~ speed, data = training)
    predictions <- predict(model, newdata = test)
    targets <- test$dist
    result[i] <- rmse(targets, predictions)
  }
  result
}
cars %>%permute_rows %>%group_data(5) %>% cross_validation_split %>%prediction_accuracy_cars
## [1] 242.8901 190.2690 155.4522 246.3786 182.8511
# Randomly assign n items to the groups named in probs, with group sizes
# proportional to probs (the sizes are deterministic; only the assignment
# order is random, via sample()).
# FIX: spell out `length.out` -- the original relied on partial matching of
# `length` in seq(), which is fragile style.
random_group <- function(n, probs) {
  probs <- probs / sum(probs)
  g <- findInterval(seq(0, 1, length.out = n), c(0, cumsum(probs)),
                    rightmost.closed = TRUE)
  names(probs)[sample(g)]
}
random_group(8, c(training = 0.5, test = 0.5))
## [1] "training" "training" "test" "training" "test" "training" "test"
## [8] "test"
random_group(8, c(training = 0.6, test = 0.4))
## [1] "test" "training" "training" "training" "test" "test" "training"
## [8] "training"
random_group(8, c(training = 0.8, test = 0.2))
## [1] "training" "training" "training" "test" "training" "training" "test"
## [8] "training"
# Create n independent random splits of df, one per replicate; each element
# is a list of data frames keyed by the group names in probs.
partition <- function(df, n, probs) {
  replicate(n,
            split(df, random_group(nrow(df), probs)),
            simplify = FALSE)
}
random_cars <- cars %>% partition(4, c(training = 0.5, test = 0.5))
random_cars %>% prediction_accuracy_cars
## [1] 63.03685 84.98622 73.82351 87.96673
### Decision Tree
library(rpart)
model <- cars %>% rpart(dist ~ speed, data = .)
rmse(predict(model, cars), cars$dist) # root mean square
## [1] 117.1626
model <- BreastCancer %>%
rpart(Class ~ Cl.thickness, data = .)
predict(model, BreastCancer) %>% head
## benign malignant
## 1 0.82815356 0.1718464
## 2 0.82815356 0.1718464
## 3 0.82815356 0.1718464
## 4 0.82815356 0.1718464
## 5 0.82815356 0.1718464
## 6 0.03289474 0.9671053
predicted_class <- predict(model, BreastCancer) %>%
as.data.frame %$% ifelse(benign > 0.5, "benign", "malignant")
table(BreastCancer$Class, predicted_class)
## predicted_class
## benign malignant
## benign 453 5
## malignant 94 147
accuracy = sum(453,147) / sum(453,147,5,94)
accuracy
## [1] 0.8583691
#install.packages("party")
library(party)
model <- cars %>% ctree(dist ~ speed, data = .)
rmse(predict(model, cars), cars$dist)
## [1] 117.1626
model <- BreastCancer %>%
ctree(Class ~ Cl.thickness, data = .)
predict(model, BreastCancer) %>% head
## [1] benign benign benign benign benign malignant
## Levels: benign malignant
table(BreastCancer$Class, predict(model, BreastCancer))
##
## benign malignant
## benign 453 5
## malignant 94 147
cars %>% ctree(dist ~ speed, data = .) %>% plot
### Random Forests
library(randomForest)
model <- cars %>% randomForest(dist ~ speed, data = .)
rmse(predict(model, cars), cars$dist) # root mean square error
## [1] 83.52702
model <- BreastCancer %>% randomForest(Class ~ Cl.thickness, data = .)
predict(model, BreastCancer) %>% head
## 1 2 3 4 5 6
## benign benign benign malignant benign malignant
## Levels: benign malignant
table(BreastCancer$Class, predict(model, BreastCancer))
##
## benign malignant
## benign 437 21
## malignant 76 165
accuracy = sum(437,165) / sum(437,165,76,21)
accuracy
## [1] 0.8612303
### Neural Networks
library(nnet)
model <- cars %>% nnet(dist ~ speed, data = ., size = 5)
## # weights: 16
## initial value 121964.440575
## final value 120655.000000
## converged
rmse(predict(model, cars), cars$dist)
## [1] 347.3543
model <- BreastCancer %>% nnet(Class ~ Cl.thickness, data = ., size = 5)
## # weights: 56
## initial value 456.304615
## iter 10 value 228.635702
## iter 20 value 225.141400
## iter 30 value 225.098834
## iter 40 value 225.098489
## iter 50 value 225.098398
## final value 225.098360
## converged
predict(model, BreastCancer) %>% head
## [,1]
## 1 0.3461856
## 2 0.3461856
## 3 0.1111388
## 4 0.5294387
## 5 0.1499583
## 6 0.9130318
predicted_class <- predict(model, BreastCancer) %>%
{ ifelse(. < 0.5, "benign", "malignant") }
table(BreastCancer$Class, predicted_class)
## predicted_class
## benign malignant
## benign 437 21
## malignant 76 165
accuracy = sum(437,165) / sum(437,165,76,21)
accuracy
## [1] 0.8612303
### Support Vector Machines
#install.packages("kernlab")
library(kernlab)
model <- cars %>% ksvm(dist ~ speed, data = .)
rmse(predict(model, cars), cars$dist)
## [1] 100.4652
model <- BreastCancer %>% ksvm(Class ~ Cl.thickness, data = .)
predict(model, BreastCancer) %>% head
## [1] benign benign benign malignant benign malignant
## Levels: benign malignant
table(BreastCancer$Class, predict(model, BreastCancer))
##
## benign malignant
## benign 437 21
## malignant 76 165
### Naive Bayes
library(e1071)
model <- BreastCancer %>% naiveBayes(Class ~ Cl.thickness, data = .)
predict(model, BreastCancer) %>% head
## [1] benign benign benign malignant benign malignant
## Levels: benign malignant
table(BreastCancer$Class, predict(model, BreastCancer))
##
## benign malignant
## benign 437 21
## malignant 76 165
## **Chapter Seven**
### **Unsupervised Learning**
### Principal Component Analysis
iris %>% head()
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
#iris %>% ggplot() + geom_point(aes(x = Sepal.Length, y = Sepal.Width, colour = Species))
#iris %>% ggplot() + geom_point(aes(x = Petal.Length, y = Petal.Width, colour = Species))
pca <- iris %>% select(-Species) %>% prcomp
pca
## Standard deviations (1, .., p=4):
## [1] 2.0562689 0.4926162 0.2796596 0.1543862
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## Sepal.Length 0.36138659 -0.65658877 0.58202985 0.3154872
## Sepal.Width -0.08452251 -0.73016143 -0.59791083 -0.3197231
## Petal.Length 0.85667061 0.17337266 -0.07623608 -0.4798390
## Petal.Width 0.35828920 0.07548102 -0.54583143 0.7536574
pca %>% plot
#?prcomp
mapped_iris <- pca %>% predict(iris)
mapped_iris %>% head
## PC1 PC2 PC3 PC4
## [1,] -2.684126 -0.3193972 0.02791483 0.002262437
## [2,] -2.714142 0.1770012 0.21046427 0.099026550
## [3,] -2.888991 0.1449494 -0.01790026 0.019968390
## [4,] -2.745343 0.3182990 -0.03155937 -0.075575817
## [5,] -2.728717 -0.3267545 -0.09007924 -0.061258593
## [6,] -2.280860 -0.7413304 -0.16867766 -0.024200858
mapped_iris %>%
as.data.frame %>%
cbind(Species = iris$Species) %>%
ggplot() + geom_point(aes(x = PC1, y = PC2, colour = Species))
data(HouseVotes84)
HouseVotes84 %>% head(na.rm = TRUE)
## Class V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16
## 1 republican n y n y y y n n n y <NA> y y y n y
## 2 republican n y n y y y n n n n n y y y n <NA>
## 3 democrat <NA> y y <NA> y y n n n n y n y y n n
## 4 democrat n y y n <NA> y n n n n y n y n n y
## 5 democrat y y y n y y n n n n y <NA> y y y y
## 6 democrat n y y n y y n n n n n n y y y y
vote_patterns <- HouseVotes84 %>% select(-Class) %>% apply(c(1,2), . %>% { ifelse(as.character(.) == "n", 0, 1) }) %>% apply(c(1,2), . %>% { ifelse(is.na(.), 0.5, .) })
pca <- vote_patterns %>% prcomp
#pca %>% head
pca %>% plot
mapped_votes <- pca %>% predict(vote_patterns)
mapped_votes %>%as.data.frame %>% cbind(Class = HouseVotes84$Class) %>% ggplot() + geom_point(aes(x = PC1, y = PC2, colour = Class))
### Multidimensional Scaling
iris_dist <- iris %>% select(-Species) %>% dist
mds_iris <- iris_dist %>% cmdscale(k = 2)
mds_iris %>% head
## [,1] [,2]
## [1,] -2.684126 0.3193972
## [2,] -2.714142 -0.1770012
## [3,] -2.888991 -0.1449494
## [4,] -2.745343 -0.3182990
## [5,] -2.728717 0.3267545
## [6,] -2.280860 0.7413304
mds_iris %>% as.data.frame %>%
cbind(Species = iris$Species) %>% ggplot() +
geom_point(aes(x = V1, y = V2, colour = Species))
mds_votes <- vote_patterns %>% dist %>% cmdscale(k = 2 )
mds_votes %>% as.data.frame %>% cbind(Class = HouseVotes84$Class) %>% ggplot() + geom_point(aes(x = V1, y = V2, colour = Class))
random_ngram <- function(n) {
  # Build a random string of length n from the letters of "CHARLES",
  # sampling with replacement.
  pool <- c("C", "H", "A", "R", "L", "E", "S")
  paste0(sample(pool, size = n, replace = TRUE), collapse = "")
}
random_string <- function(m) {
  # Generate a random string whose length is m perturbed by a random
  # geometric offset in a random direction, but never shorter than 1.
  direction <- sample(c(-1, 1), size = 1)
  offset <- rgeom(1, 1/2)
  random_ngram(max(1, m + direction * offset))
}
strings <- replicate(10, random_string(5))
#install.packages("stringdist")
library(stringdist)
string_dist <- stringdistmatrix(strings)
string_dist %>% cmdscale(k = 4) %>% as.data.frame %>%
cbind(String = strings) %>% ggplot(aes(x = V1, y = V2)) +
geom_point() + geom_label(aes(label = String), hjust = 0, nudge_y = -0.1)
### Kmeans
clusters <- iris %>% select(-Species) %>% kmeans(centers = 3)
clusters$centers
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.006000 3.428000 1.462000 0.246000
## 2 5.901613 2.748387 4.393548 1.433871
## 3 6.850000 3.073684 5.742105 2.071053
#clusters$cluster %>% head
clusters$cluster %>% table
## .
## 1 2 3
## 50 62 38
iris %>% cbind(Cluster = clusters$cluster) %>%
ggplot() + geom_bar(aes(x = Species, fill = as.factor(Cluster)), position = "dodge") + scale_fill_discrete("Cluster")
pca <- iris %>% select(-Species) %>% prcomp
mapped_iris <- pca %>% predict(iris)
mapped_centers <- pca %>% predict(clusters$centers)
mapped_iris %>%
as.data.frame %>%
cbind(Species = iris$Species,
Clusters = as.factor(clusters$cluster)) %>%
ggplot() +
geom_point(aes(x = PC1, y = PC2,
colour = Species, shape = Clusters)) +
geom_point(aes(x = PC1, y = PC2),
size = 5, shape = "X",
data = as.data.frame(mapped_centers))
table(iris$Species, clusters$cluster)
##
## 1 2 3
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 14 36
tbl <- table(iris$Species, clusters$cluster)
(counts <- apply(tbl, 1, which.max))
## setosa versicolor virginica
## 1 2 3
tbl <- table(iris$Species, clusters$cluster)
(counts <- apply(tbl, 1, which.min))
## setosa versicolor virginica
## 2 1 1
map <- rep(NA, each = 3)
map[counts] <- names(counts)
table(iris$Species, map[clusters$cluster])
##
## setosa virginica
## setosa 0 50
## versicolor 48 0
## virginica 14 0
#?prcomp
## Hierarchical Clustering
iris_dist <- iris %>% select(-Species) %>% scale %>% dist
clustering <- hclust(iris_dist)
plot(clustering)
library(ggdendro)
ggdendrogram(clustering) + theme_dendro()
clusters <- clustering %>% cutree(k = 2)
iris %>% cbind(Cluster = clusters) %>%
ggplot() + geom_bar(aes(x = Species, fill = as.factor(Cluster)), position = "dodge") +
scale_fill_discrete("Cluster")
mapped_iris %>% as.data.frame %>%
cbind(Species = iris$Species,
Clusters = as.factor(clusters)) %>%
ggplot() + geom_point(aes(x = PC1, y = PC2,
shape = Species, colour = Clusters))
## Association Rules
library(arules)
data(income)
income %>% head
## INCOME SEX MARITAL.STATUS AGE EDUCATION
## 1 [75.000- F Married 45-54 1 to 3 years of college
## 2 [75.000- M Married 45-54 College graduate
## 3 [75.000- F Married 25-34 College graduate
## 4 -10.000) F Single 14-17 Grades 9 to 11
## 5 -10.000) F Single 14-17 Grades 9 to 11
## 6 [50.000-75.000) M Married 55-64 1 to 3 years of college
## OCCUPATION AREA DUAL.INCOMES HOUSEHOLD.SIZE UNDER18
## 1 Homemaker 10+ years No Three None
## 2 Homemaker 10+ years No Five Two
## 3 Professional/Managerial 10+ years Yes Three One
## 4 Student, HS or College 10+ years Not Married Four Two
## 5 Student, HS or College 4-6 years Not Married Four Two
## 6 Retired 10+ years No Two None
## HOUSEHOLDER HOME.TYPE ETHNIC.CLASS LANGUAGE
## 1 Own House White <NA>
## 2 Own House White English
## 3 Rent Apartment White English
## 4 Family House White English
## 5 Family House White English
## 6 Own House White English
data(Income)
Income %>% head
## transactions in sparse format with
## 6 transactions (rows) and
## 50 items (columns)
rules <- income %>% apriori
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 899
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[84 item(s), 8993 transaction(s)] done [0.01s].
## sorting and recoding items ... [42 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 done [0.04s].
## writing ... [785 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
rules %>% head %>% inspect(linebreak = FALSE)
## lhs rhs support
## [1] {} => {LANGUAGE=English} 0.8666741
## [2] {EDUCATION=Grad Study} => {LANGUAGE=English} 0.1000778
## [3] {OCCUPATION=Clerical/Service Worker} => {LANGUAGE=English} 0.1046369
## [4] {INCOME=[30.000-40.000)} => {LANGUAGE=English} 0.1111976
## [5] {UNDER18=Two} => {LANGUAGE=English} 0.1073057
## [6] {INCOME=[50.000-75.000)} => {LANGUAGE=English} 0.1329923
## confidence coverage lift count
## [1] 0.8666741 1.0000000 1.0000000 7794
## [2] 0.9316770 0.1074169 1.0750027 900
## [3] 0.8860640 0.1180918 1.0223728 941
## [4] 0.9009009 0.1234293 1.0394921 1000
## [5] 0.8405923 0.1276548 0.9699059 965
## [6] 0.9143731 0.1454465 1.0550368 1196
rules %>% sort(by = "lift") %>% head %>% inspect(linebreak = FALSE)
## lhs
## [1] {MARITAL.STATUS=Married, OCCUPATION=Professional/Managerial, LANGUAGE=English}
## [2] {MARITAL.STATUS=Married, OCCUPATION=Professional/Managerial}
## [3] {DUAL.INCOMES=No, HOUSEHOLDER=Own}
## [4] {AREA=10+ years, DUAL.INCOMES=Yes, HOME.TYPE=House}
## [5] {DUAL.INCOMES=Yes, HOUSEHOLDER=Own, HOME.TYPE=House, LANGUAGE=English}
## [6] {DUAL.INCOMES=Yes, HOUSEHOLDER=Own, HOME.TYPE=House}
## rhs support confidence coverage lift count
## [1] => {DUAL.INCOMES=Yes} 0.1091960 0.8069022 0.1353275 3.281986 982
## [2] => {DUAL.INCOMES=Yes} 0.1176471 0.8033409 0.1464472 3.267501 1058
## [3] => {MARITAL.STATUS=Married} 0.1016346 0.9713071 0.1046369 2.619965 914
## [4] => {MARITAL.STATUS=Married} 0.1003002 0.9605964 0.1044145 2.591075 902
## [5] => {MARITAL.STATUS=Married} 0.1098632 0.9601555 0.1144223 2.589886 988
## [6] => {MARITAL.STATUS=Married} 0.1209830 0.9594356 0.1260981 2.587944 1088
rules %>% subset(support > 0.5) %>% sort(by = "lift") %>%
head %>% inspect(linebreak = FALSE)
## lhs rhs support confidence
## [1] {ETHNIC.CLASS=White} => {LANGUAGE=English} 0.6110308 0.9456204
## [2] {AREA=10+ years} => {LANGUAGE=English} 0.5098410 0.8847935
## [3] {UNDER18=None} => {LANGUAGE=English} 0.5609919 0.8813767
## [4] {} => {LANGUAGE=English} 0.8666741 0.8666741
## [5] {DUAL.INCOMES=Not Married} => {LANGUAGE=English} 0.5207384 0.8611622
## coverage lift count
## [1] 0.6461692 1.0910911 5495
## [2] 0.5762260 1.0209069 4585
## [3] 0.6364951 1.0169644 5045
## [4] 1.0000000 1.0000000 7794
## [5] 0.6046925 0.9936402 4683
# **Chapter 8**
## **More on R Programming**
3 %% 4
## [1] 3
3 %/% 4
## [1] 0
3/4
## [1] 0.75
1 : 2 ^ 4
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
-1 : 2
## [1] -1 0 1 2
!TRUE
## [1] FALSE
!FALSE
## [1] TRUE
x <- c(TRUE, FALSE, TRUE, FALSE)
y <- c(TRUE, TRUE, FALSE, FALSE)
x | y
## [1] TRUE TRUE TRUE FALSE
FALSE | TRUE
## [1] TRUE
FALSE || TRUE
## [1] TRUE
is.numeric(2)
## [1] TRUE
x <- as.integer(2)
is.integer(x)
## [1] TRUE
as.integer(3.2)
## [1] 3
as.character(9.99999)
## [1] "9.99999"
sqrt(as.complex(1))
## [1] 1+0i
x <- 5 > 4
x
## [1] TRUE
class(x)
## [1] "logical"
is.logical(x)
## [1] TRUE
x <- "hello, world"
class(x)
## [1] "character"
is.character(x)
## [1] TRUE
as.character(3.14)
## [1] "3.14"
rep("foo", 10)
## [1] "foo" "foo" "foo" "foo" "foo" "foo" "foo" "foo" "foo" "foo"
v <- 1:3
is.atomic(v)
## [1] TRUE
v <- 1:3
is.vector(v)
## [1] TRUE
attr(v, "foo") <- "bar"
v
## [1] 1 2 3
## attr(,"foo")
## [1] "bar"
is.vector(v)
## [1] FALSE
c(1, 2, c(3, 4), c(5, 6, 7))
## [1] 1 2 3 4 5 6 7
c(1, 2, 3, "foo") -> vec
vec
## [1] "1" "2" "3" "foo"
is.vector(vec)
## [1] TRUE
is.character(vec)
## [1] TRUE
v <- 1:6
attributes(v)
## NULL
dim(v) <- c(2, 3)
attributes(v)
## $dim
## [1] 2 3
dim(v)
## [1] 2 3
v
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
v <- 1:6
matrix(data = v, nrow = 2, ncol = 3, byrow = FALSE)
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
matrix(data = v, nrow = 2, ncol = 3, byrow = TRUE)
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
(A <- matrix(1:4, nrow = 2))
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
(B <- matrix(5:8, nrow = 2))
## [,1] [,2]
## [1,] 5 7
## [2,] 6 8
A * B # element-wise mult
## [,1] [,2]
## [1,] 5 21
## [2,] 12 32
A %*% B
## [,1] [,2]
## [1,] 23 31
## [2,] 34 46
t(A)
## [,1] [,2]
## [1,] 1 2
## [2,] 3 4
solve(A) ### inverse
## [,1] [,2]
## [1,] -2 1.5
## [2,] 1 -0.5
solve(A) %*% A ### identity matrix
## [,1] [,2]
## [1,] 1 0
## [2,] 0 1
list(1:3, 5:8)
## [[1]]
## [1] 1 2 3
##
## [[2]]
## [1] 5 6 7 8
list(1:3, c(TRUE, FALSE))
## [[1]]
## [1] 1 2 3
##
## [[2]]
## [1] TRUE FALSE
list((1:4), (5:8), (9:12))
## [[1]]
## [1] 1 2 3 4
##
## [[2]]
## [1] 5 6 7 8
##
## [[3]]
## [1] 9 10 11 12
unlist(list(1:4, 5:7))
## [1] 1 2 3 4 5 6 7
#?'%%'
#?`[[`
v <- 1:4
v[2]
## [1] 2
v[2:3]
## [1] 2 3
v[c(1,1,4,3,2)]
## [1] 1 1 4 3 2
v[-1]
## [1] 2 3 4
v[-(1:2)]
## [1] 3 4
v[v %% 2 == 0]
## [1] 2 4
v[v %% 2 == 0] <- 13
v
## [1] 1 13 3 13
m <- matrix(1:6, nrow = 2, byrow = TRUE)
m
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
m[1,]
## [1] 1 2 3
m[,1]
## [1] 1 4
m[1:2,1:2]
## [,1] [,2]
## [1,] 1 2
## [2,] 4 5
m[1,,drop = FALSE]
## [,1] [,2] [,3]
## [1,] 1 2 3
m[,1,drop = FALSE]
## [,1]
## [1,] 1
## [2,] 4
L <- list(1,2,3)
L[1]
## [[1]]
## [1] 1
L[2:3]
## [[1]]
## [1] 2
##
## [[2]]
## [1] 3
L[[1]]
## [1] 1
L[[2]]
## [1] 2
L[[3]]
## [1] 3
### Named Values
v <- c(a = 1, b = 2, c = 3, d = 4)
v
## a b c d
## 1 2 3 4
L <- list(a = 1:5, b = c(TRUE, FALSE))
L
## $a
## [1] 1 2 3 4 5
##
## $b
## [1] TRUE FALSE
names(v) <- LETTERS[1:4]
v
## A B C D
## 1 2 3 4
v["A"]
## A
## 1
L["a"]
## $a
## [1] 1 2 3 4 5
L[["a"]]
## [1] 1 2 3 4 5
L$a
## [1] 1 2 3 4 5
### Control Structures
if (10 < 12) {
print("Yes")
} else {
print("No")
}
## [1] "Yes"
x = 100
if (x < 0) {
print("Negative")
} else if (x > 0) {
print("Positive")
} else {
print("Nothing")
}
## [1] "Positive"
if (x > 0) "positive" else if (x < 0) "negative" else "zero"
## [1] "positive"
if (x > 0) {
print("positive")
}else if (x < 0){
print("negative")
}else{
print("zero")
}
## [1] "positive"
for (i in 1:10) {
print(i * i)
}
## [1] 1
## [1] 4
## [1] 9
## [1] 16
## [1] 25
## [1] 36
## [1] 49
## [1] 64
## [1] 81
## [1] 100
x <- c("foo", "bar", "baz")
for (i in seq_along(x)) {
print(i)
print(x[i])
}
## [1] 1
## [1] "foo"
## [1] 2
## [1] "bar"
## [1] 3
## [1] "baz"
for (i in seq_along(x)) {
if (i %% 2 == 0) {
next
}
print(x[i])
}
## [1] "foo"
## [1] "baz"
for (i in 1:100) {
if (i %% 2 == 0) {
next
}
if (i > 20) {
break
}
print(i)
}
## [1] 1
## [1] 3
## [1] 5
## [1] 7
## [1] 9
## [1] 11
## [1] 13
## [1] 15
## [1] 17
## [1] 19
i <- 1
while (i < 5) {
i <- i + 1
print(i)
}
## [1] 2
## [1] 3
## [1] 4
## [1] 5
i <- 1
repeat {
print(i)
i <- i + 1
if (i > 5) break
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
### Function
f <- function() {
print("hello, world")
(5 + 4) + (4 ** 2)
}
f()
## [1] "hello, world"
## [1] 25
plus <- function(x, y) {
print(paste(x, "+", y, "is", x + y))
x + y
}
plus(2, 2)
## [1] "2 + 2 is 4"
## [1] 4
# Print the division spelled out as a sentence, then return the quotient.
div <- function(x, y) {
print(paste(x, "/", y, "is", x / y))
x / y}
div (6,2)
## [1] "6 / 2 is 3"
## [1] 3
div(3,4)
## [1] "3 / 4 is 0.75"
## [1] 0.75
div(y = 2, x = 6)
## [1] "6 / 2 is 3"
## [1] 3
pow <- function(x, y = 2) x^y
pow(2)
## [1] 4
pow(3)
## [1] 9
pow(4)
## [1] 16
pow(8)
## [1] 64
pow(9,2)
## [1] 81
pow(8,0)
## [1] 1
# Division that yields NA instead of Inf/NaN when the divisor is zero.
# Note: `y == 0` is an exact comparison; values merely close to zero
# divide as usual.
safer_div <- function(x, y) {
if (y == 0) {
NA
} else {
x / y
}
}
safer_div(2,4)
## [1] 0.5
safer_div(3,9)
## [1] 0.3333333
# Same zero guard, rewritten with an early return() instead of if/else.
safer_div <- function(x, y) {
if (y == 0) {
return(NA)
}
x / y
}
safer_div(3,6)
## [1] 0.5
safer_div(1,0)
## [1] NA
f <- function(x, y = x^2) y + x
f(7)
## [1] 56
g <- function(x, y = x^2) { x <- 0; y + x }
g(2)
## [1] 0
g(2,9)
## [1] 9
h <- function(x, y = x^2) { y; x <- 0; y + x }
h(2)
## [1] 4
h(8)
## [1] 64
### Scoping
x <- "x"
f <- function(y) {
g <- function() c(x, y)
g()
}
f("y")
## [1] "x" "y"
x <- "x"
f <- function(y) {
g <- function() c(x, y)
y <- "z"
g()
}
f("y")
## [1] "x" "z"
x <- "x"
f <- function(y) {
g <- function() c(x, y)
g
}
g <- f("y")
g()
## [1] "x" "y"
x <- "x"
f <- function(y) {
g <- function() c(x, y)
g
}
g <- f("u")
g()
## [1] "x" "u"
h <- f("z")
h()
## [1] "x" "z"
# Closure-based "object" with set/get — deliberately broken: inside set(),
# `x <- val` creates a LOCAL x, so the enclosing x is never updated and
# get() keeps returning NULL (demonstrated in the transcript below).
f <- function() {
x <- NULL
set <- function(val) { x <- val }
get <- function() x
list(set = set, get = get)
}
x <- f()
x$get()
## NULL
x$set(5)
x$get()
## NULL
# Corrected version: `<<-` assigns to x in the enclosing (closure)
# environment, so set() really updates the value that get() reads.
f <- function(){
x <- NULL
set <- function(val) { x <<- val }
get <- function() x
list(set = set, get = get)
}
x <- f()
x$get()
## NULL
x$set(5)
x$get()
## [1] 5
n <- function(x) x
f <- function(n) n(n)
f(5)
## [1] 5
f(function(x) 15)
## [1] 15
# Recursively compute n! for a non-negative integer n.
# The base case covers both 0 and 1 (0! == 1); the original `n == 1`
# test recursed without terminating for any n <= 0.
# NOTE: this shadows base::factorial (which uses gamma()).
factorial <- function(n) {
  if (n <= 1) {
    1
  } else {
    n * factorial(n - 1)
  }
}
factorial(9)
## [1] 362880
factorial(1)
## [1] 1
# Merge two sorted vectors into one sorted vector — the "merge" step of
# merge sort. NOTE(review): this shadows base::merge (data-frame joins),
# and its recursion depth is length(x) + length(y), so very long inputs
# can exhaust the call stack.
merge <- function(x, y) {
if (length(x) == 0) return(y)
if (length(y) == 0) return(x)
if (x[1] < y[1]) {
c(x[1], merge(x[-1], y))
} else {
c(y[1], merge(x, y[-1]))
}
}
merge(9,0)
## [1] 0 9
merge(12,5)
## [1] 5 12
merge_sort <- function(x) {
  # Classic top-down merge sort: vectors of length < 2 are already
  # sorted; otherwise split in half, sort each half, and merge.
  if (length(x) < 2) {
    return(x)
  }
  mid <- length(x) %/% 2
  left <- merge_sort(x[seq_len(mid)])
  right <- merge_sort(x[(mid + 1):length(x)])
  merge(left, right)
}
merge_sort (10)
## [1] 10
# **Chapter 9**
### **Advanced R Programming**
(x <- 2 / 3)
## [1] 0.6666667
(y <- x ** 2)
## [1] 0.4444444
(x <- 1:4 / 3)
## [1] 0.3333333 0.6666667 1.0000000 1.3333333
(y <- x ** 2)
## [1] 0.1111111 0.4444444 1.0000000 1.7777778
x <- 1:5
y <- 6:10
(z <- x + y)
## [1] 7 9 11 13 15
(s = y - x)
## [1] 5 5 5 5 5
z <- vector(length = length(x))
for (i in seq_along(x)) {
z[i] <- x[i] + y[i]
}
z
## [1] 7 9 11 13 15
s = vector(length = length(x))
for (i in seq_along(x)){
s[i] = y[i] - x[i]
}
s
## [1] 5 5 5 5 5
sqrt((1:5)**2)
## [1] 1 2 3 4 5
sin(sqrt((1:5)**2))
## [1] 0.8414710 0.9092974 0.1411200 -0.7568025 -0.9589243
x <- 1:10
y <- 1:2
x + y
## [1] 2 4 4 6 6 8 8 10 10 12
x <- 1:10
ifelse(x %% 2 == 0, 5, 15)
## [1] 15 5 15 5 15 5 15 5 15 5
f <- function(x, y) sqrt(x ** y)
f(1:6, 2)
## [1] 1 2 3 4 5 6
f(1:6, 1:2)
## [1] 1.000000 2.000000 1.732051 4.000000 2.236068 6.000000
role_table <- list("Thomas" = "Instructor",
"Henrik" = "Student",
"Kristian" = "Student",
"Randi" = "Student",
"Heidi" = "Student",
"Manfred" = "Student",
"Charles" = "Researcher")
map_to_role <- function(name) role_table[[name]]
map_to_role("Thomas")
## [1] "Instructor"
map_to_role("Charles")
## [1] "Researcher"
role_table[c("Thomas", "Henrik", "Randi","Charles")]
## $Thomas
## [1] "Instructor"
##
## $Henrik
## [1] "Student"
##
## $Randi
## [1] "Student"
##
## $Charles
## [1] "Researcher"
map_to_role_2 <- function(names) unlist(role_table[names])
x <- c("Thomas", "Henrik", "Randi","Charles")
map_to_role_2(x)
## Thomas Henrik Randi Charles
## "Instructor" "Student" "Student" "Researcher"
x <- list("first" = list("second" = "foo"), "third" = "bar")
x[[c("first", "second")]]
## [1] "foo"
#x[["first"]][["second"]]
### apply
m <- matrix(1:6, nrow = 2, byrow = TRUE)
m
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
apply(m, 1, function(x) paste(x, collapse = ":")) # row-wise
## [1] "1:2:3" "4:5:6"
apply(m, 2, function(x) paste(x, collapse = ":")) # column-wise
## [1] "1:4" "2:5" "3:6"
apply(m, c(1,2), function(x) paste(x, collapse = ":")) # Both
## [,1] [,2] [,3]
## [1,] "1" "2" "3"
## [2,] "4" "5" "6"
apply(m, 1, function(x) c(x,x))
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
## [4,] 1 4
## [5,] 2 5
## [6,] 3 6
apply(m, 2, function(x) c(x,x))
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
## [3,] 1 2 3
## [4,] 4 5 6
apply(m, c(1,2), function(x) c(x,x))
## , , 1
##
## [,1] [,2]
## [1,] 1 4
## [2,] 1 4
##
## , , 2
##
## [,1] [,2]
## [1,] 2 5
## [2,] 2 5
##
## , , 3
##
## [,1] [,2]
## [1,] 3 6
## [2,] 3 6
x <- apply(m, c(1,2), function(x) c(x,x))
k <- dim(x)[3]
n <- dim(x)[2]
for (i in 1:n) {
for (j in 1:k) {
print(x[,i,j])
}
}
## [1] 1 1
## [1] 2 2
## [1] 3 3
## [1] 4 4
## [1] 5 5
## [1] 6 6
sumpow <- function(x, n) sum(x) ** n
apply(m, 1, sumpow,n = 2) # row - wise
## [1] 36 225
apply(m, 2, sumpow, n = 2) # column - wise
## [1] 25 49 81
### lapply
l <- list(1, 2, 3)
lapply(l, function(x) x**2)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 9
l <- list(a = 1, b = 2, c = 3)
lapply(l, function(x) x**2)
## $a
## [1] 1
##
## $b
## [1] 4
##
## $c
## [1] 9
lapply(1:5, function(x) x**4)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 16
##
## [[3]]
## [1] 81
##
## [[4]]
## [1] 256
##
## [[5]]
## [1] 625
lapply(list(a = 1:3, b = 4:6), function(x) x**2)
## $a
## [1] 1 4 9
##
## $b
## [1] 16 25 36
### sapply
sapply(1:3, function(x) x**2) # returns vector by default
## [1] 1 4 9
vapply(1:3, function(x) x**2,1)
## [1] 1 4 9
#?vapply
### Advanced Functions
`+`(2, 2)
## [1] 4
v <- 1:4
names(v) <- c("a", "b", "c", "d")
v
## a b c d
## 1 2 3 4
`foo<-` <- function(x, value) {
x$foo <- value
x
}
`bar<-` <- function(x, value) {
x$bar <- value
x
}
x <- list(foo = 1, bar = 2)
x$foo
## [1] 1
x$bar
## [1] 2
foo(x) <- 4
x$foo
## [1] 4
bar(x) <- 3
x$bar
## [1] 3
y <- x
foo(x) <- 5
x
## $foo
## [1] 5
##
## $bar
## [1] 3
y
## $foo
## [1] 4
##
## $bar
## [1] 3
`foo<-` <- function(y, value) {
y$foo <- value
y
}
x <- list(foo = 1, bar = 2)
foo(x) <- 3
x$foo
## [1] 3
`modify<-` <- function(x, variable, value) {
x[variable] <- value
x
}
x <- list(foo = 1, bar = 2)
modify(x, "foo") <- 5
modify(x, "bar") <- 7
x$foo
## [1] 5
x$bar
## [1] 7
x <- 1:4
# Demonstrates copy-on-modify: the assignments below change f's local
# copy of x only; the caller's vector is unchanged (see the transcript
# after this block).
f <- function(x) {
x[2] <- 5 # replace element 2 with 5
x[1] <- 9 # replace element 1 with 9
x[4] <- 10 # replace element 4 with 10
x
}
x
## [1] 1 2 3 4
f(x)
## [1] 9 5 3 10
square <- function(x) (x**2 - x)
square(5)
## [1] 20
m <- matrix(1:6, nrow = 3)
m
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
sum_of_squares <- function(x) sum(x^2)
apply(m, 1, sum_of_squares) # row - wise
## [1] 17 29 45
apply(m,2 , sum_of_squares) # column - wise
## [1] 14 77
#apply(m^2, 1, sum)
apply_if <- function(x, p, f) {
  # Apply f to exactly those elements of x that satisfy predicate p,
  # preserving their order. The result vector is preallocated at full
  # length and trimmed at the end instead of grown inside the loop.
  out <- vector(length = length(x))
  kept <- 0
  for (idx in seq_along(x)) {
    elem <- x[idx]
    if (p(elem)) {
      kept <- kept + 1
      out[kept] <- f(elem)
    }
  }
  head(out, kept)
}
apply_if(1:8, function(x) x %% 2 == 0, function(x) x^2)
## [1] 4 16 36 64
power <- function(n) function(x) x**n
square <- power(2)
cube <- power(3)
x <- 1:4
square(x)
## [1] 1 4 9 16
cube(x)
## [1] 1 8 27 64
### Filter, Map and Reduce
is_even <- function(x) x %% 2 == 0
Filter(is_even, 1:8)
## [1] 2 4 6 8
Filter(is_even, unlist(1:8))
## [1] 2 4 6 8
Filter(is_even, as.list(1:8))
## [[1]]
## [1] 2
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 6
##
## [[4]]
## [1] 8
divi <- function(x) (x %% 3 == 0)
Filter(divi, 1:12)
## [1] 3 6 9 12
square <- function(x) x^2
Map(square, 1:4)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 9
##
## [[4]]
## [1] 16
unlist(Map(square, 1:4))
## [1] 1 4 9 16
plus <- function(x, y) x + y
unlist(Map(plus, 0:3, 3:0))
## [1] 3 3 3 3
add_parenthesis <- function(a, b) paste("(", a, ", ", b, ")", sep = "")
Reduce(add_parenthesis, 1:4)
## [1] "(((1, 2), 3), 4)"
mysum <- function(x) Reduce(`+`, x)
mysum(1:4)
## [1] 10
### Function Operations
# Function operator: wrap f with a memoisation table stored in the
# closure's environment, keyed by the string form of the argument.
# The print() calls are deliberate — they trace cache hits and misses
# for the demonstration transcript that follows.
cached <- function(f) {
force(f) # force the promise now, so rebinding f later cannot change what is wrapped
table <- list()
function(n) {
key <- as.character(n)
if (key %in% names(table)) {
print(paste("I have already computed the value for", n))
table[[key]]
} else {
print(paste("Going to compute the value for", n))
res <- f(n)
print(paste("That turned out to be", res))
table[key] <<- res # `<<-` updates the table in the enclosing environment
print(table)
res
}
}
}
# Recursive n! — redefined here so that cached() below can wrap it.
# Base case uses n <= 1 (0! == 1); the original `n == 1` test recursed
# forever for n <= 0. Values n >= 1 behave exactly as before, so the
# cached-trace transcript below is unaffected.
factorial <- function(n) {
  if (n <= 1) {
    1
  } else {
    n * factorial(n - 1)
  }
}
factorial <- cached(factorial)
factorial(4)
## [1] "Going to compute the value for 4"
## [1] "Going to compute the value for 3"
## [1] "Going to compute the value for 2"
## [1] "Going to compute the value for 1"
## [1] "That turned out to be 1"
## $`1`
## [1] 1
##
## [1] "That turned out to be 2"
## $`1`
## [1] 1
##
## $`2`
## [1] 2
##
## [1] "That turned out to be 6"
## $`1`
## [1] 1
##
## $`2`
## [1] 2
##
## $`3`
## [1] 6
##
## [1] "That turned out to be 24"
## $`1`
## [1] 1
##
## $`2`
## [1] 2
##
## $`3`
## [1] 6
##
## $`4`
## [1] 24
## [1] 24
factorial(1)
## [1] "I have already computed the value for 1"
## [1] 1
factorial(2)
## [1] "I have already computed the value for 2"
## [1] 2
factorial(3)
## [1] "I have already computed the value for 3"
## [1] 6
factorial(4)
## [1] "I have already computed the value for 4"
## [1] 24
factorial(5)
## [1] "Going to compute the value for 5"
## [1] "I have already computed the value for 4"
## [1] "That turned out to be 120"
## $`1`
## [1] 1
##
## $`2`
## [1] 2
##
## $`3`
## [1] 6
##
## $`4`
## [1] 24
##
## $`5`
## [1] 120
## [1] 120
# Naive doubly-recursive Fibonacci: exponential time, which is exactly
# why it is wrapped with cached() below. Defined only for n >= 1 —
# calling it with n <= 0 recurses without terminating.
fibonacci <- function(n) {
if (n == 1 || n == 2) {
1
} else {
fibonacci(n - 1) + fibonacci(n - 2)
}
}
fibonacci <- cached(fibonacci)
fibonacci(4)
## [1] "Going to compute the value for 4"
## [1] "Going to compute the value for 3"
## [1] "Going to compute the value for 2"
## [1] "That turned out to be 1"
## $`2`
## [1] 1
##
## [1] "Going to compute the value for 1"
## [1] "That turned out to be 1"
## $`2`
## [1] 1
##
## $`1`
## [1] 1
##
## [1] "That turned out to be 2"
## $`2`
## [1] 1
##
## $`1`
## [1] 1
##
## $`3`
## [1] 2
##
## [1] "I have already computed the value for 2"
## [1] "That turned out to be 3"
## $`2`
## [1] 1
##
## $`1`
## [1] 1
##
## $`3`
## [1] 2
##
## $`4`
## [1] 3
## [1] 3
fibonacci(1)
## [1] "I have already computed the value for 1"
## [1] 1
fibonacci(2)
## [1] "I have already computed the value for 2"
## [1] 1
fibonacci(3)
## [1] "I have already computed the value for 3"
## [1] 2
fibonacci(4)
## [1] "I have already computed the value for 4"
## [1] 3
fibonacci(5)
## [1] "Going to compute the value for 5"
## [1] "I have already computed the value for 4"
## [1] "I have already computed the value for 3"
## [1] "That turned out to be 5"
## $`2`
## [1] 1
##
## $`1`
## [1] 1
##
## $`3`
## [1] 2
##
## $`4`
## [1] 3
##
## $`5`
## [1] 5
## [1] 5
?fibonacci
## No documentation for 'fibonacci' in specified packages and libraries:
## you could try '??fibonacci'
### Ellipsis Parameters
g <- function(a, b, ...) NULL
g(a = 1, b = 2, c = 3)
## NULL
tolist <- function(...) list(...)
tolist()
## list()
tolist(a = 1)
## $a
## [1] 1
tolist(a = 1, b = 3)
## $a
## [1] 1
##
## $b
## [1] 3
tolist(a = 1, b = 3, c = 4,d = 12, e = 23)
## $a
## [1] 1
##
## $b
## [1] 3
##
## $c
## [1] 4
##
## $d
## [1] 12
##
## $e
## [1] 23
### Timing Function Calls
# Function operator: wrap f so that calling the wrapper measures how
# long f(...) takes (via system.time) and returns the timing instead
# of f's result. (The name is historical; it times, it does not
# impose a timeout.)
time_out <- function(f) {
  force(f)  # evaluate f now so later rebinding cannot change the wrapped function
  function(...) system.time(f(...))
}
ti_mean <- time_out(mean)
ti_mean(runif(1e6))
## user system elapsed
## 0.03 0.02 0.03
# **Chapter 10**
## **Object Oriented Programming**
### **Bayesian Linear Models**
# Fit a Bayesian linear regression for the given model formula.
# alpha is the prior precision of the weights, beta the noise precision.
# Returns the formula, the model frame, and the posterior mean and
# covariance of the weight vector.
blm <- function(model, alpha = 1, beta = 1, ...) {
# Here goes the mathematics for computing the fit.
frame <- model.frame(model, ...)
phi <- model.matrix(frame) # design matrix
no_params <- ncol(phi)
target <- model.response(frame)
# Posterior covariance: S = (alpha * I + beta * phi^T phi)^-1
covar <- solve(diag(alpha, no_params) + beta * t(phi) %*% phi)
# Posterior mean: m = beta * S * phi^T * y (local `mean` shadows base::mean)
mean <- beta * covar %*% t(phi) %*% target
list(formula = model,
frame = frame,
mean = mean,
covar = covar)
}
# fake some data for our linear model
x <- rnorm(10)
a <- 1 ; b <- 1.3
w0 <- 0.2 ; w1 <- 3
y <- rnorm(10, mean = w0 + w1 * x, sd = sqrt(1/b))
#frame <- data.frame (x_1 = x, y_1 = y )
#frame
# fit a model
model <- blm(y ~ x, alpha = a, beta = b)
model
## $formula
## y ~ x
##
## $frame
## y x
## 1 -3.2696915 -1.1369671
## 2 -4.3496517 -1.5037810
## 3 -3.8920238 -1.8113726
## 4 -0.6998128 -0.4769833
## 5 -5.3577295 -1.3622658
## 6 -4.1737824 -1.8095640
## 7 -4.0190159 -1.1597942
## 8 -7.9212743 -2.2600729
## 9 6.4632952 1.5249919
## 10 4.2227425 1.3204692
##
## $mean
## [,1]
## (Intercept) 0.4022186
## x 3.1501395
##
## $covar
## (Intercept) x
## (Intercept) 0.10168968 0.03756497
## x 0.03756497 0.04663170
class(model)
## [1] "list"
class(model) <- "blm"
class(model)
## [1] "blm"
# Second version: same fit, but the result is tagged with class "blm"
# so S3 dispatch can find methods for it. NOTE(review): the computation
# is elided ("stuff happens here"), so frame/mean/covar resolve to
# whatever happens to be in the enclosing environment — illustrative only.
blm <- function(model, alpha = 1, beta = 1, ...) {
# stuff happens here...
object <- list(formula = model,
frame = frame,
mean = mean,
covar = covar)
class(object) <- "blm"
object
}
# Equivalent shorthand: structure() builds the list and sets the class
# attribute in one call. Still a skeleton — the fit itself is elided.
blm <- function(model, alpha = 1, beta = 1, ...) {
# stuff happens here...
structure(list(formula = model,
frame = frame,
mean = mean,
covar = covar),
class = "blm")
}
model
## $formula
## y ~ x
##
## $frame
## y x
## 1 -3.2696915 -1.1369671
## 2 -4.3496517 -1.5037810
## 3 -3.8920238 -1.8113726
## 4 -0.6998128 -0.4769833
## 5 -5.3577295 -1.3622658
## 6 -4.1737824 -1.8095640
## 7 -4.0190159 -1.1597942
## 8 -7.9212743 -2.2600729
## 9 6.4632952 1.5249919
## 10 4.2227425 1.3204692
##
## $mean
## [,1]
## (Intercept) 0.4022186
## x 3.1501395
##
## $covar
## (Intercept) x
## (Intercept) 0.10168968 0.03756497
## x 0.03756497 0.04663170
##
## attr(,"class")
## [1] "blm"
### Polymorphic Function
# S3 print method: printing a blm object shows just its model formula.
print.blm <- function(x, ...) {
print(x$formula)
}
model
## y ~ x
#??print
### Defining your own Function
Charles <- function(x, ...) UseMethod("Charles")
Charles.default <- function(x, ...) print("default Charles")
Charles("a string")
## [1] "default Charles"
#Charles(12)
Charles.blm <- function(x, ...) print("blm Charles")
Charles(model)
## [1] "blm Charles"
Charles.blm <- function(x, upper = FALSE, ...) {
if (upper) {
print("BLM FOO")
} else {
print("blm foo")
}
}
Charles("a string")
## [1] "default Charles"
Charles(model)
## [1] "blm foo"
Charles("a string", upper = TRUE)
## [1] "default Charles"
Charles(model, upper = TRUE)
## [1] "BLM FOO"
# Two generics whose defaults raise an error, plus a class "A" that
# implements both.
foo <- function(object, ...) UseMethod("foo")
foo.default <- function(object, ...) stop("foo not implemented")
bar <- function(object, ...) UseMethod("bar")
bar.default <- function(object, ...) stop("bar not implemented")
# Constructor: an A object stores its foo and bar fields in a list.
A <- function(f, b) structure(list(foo = f, bar = b), class ="A")
foo.A <- function(object, ...) paste("A::foo ->", object$foo)
bar.A <- function(object, ...) paste("A::bar ->", object$bar)
a <- A("Charles", "Nana")
foo(a)
## [1] "A::foo -> Charles"
bar(a)
## [1] "A::bar -> Nana"
# A third generic, plus a class "B" built on top of A.
baz <- function(object, ...) UseMethod("baz")
baz.default <- function(object, ...) stop("baz not implemented")
# First attempt: class(a) <- "B" REPLACES the class vector, so a B
# object would no longer dispatch to A's methods (foo would hit
# foo.default and error).
B <- function(f, b, bb) {
a <- A(f, b)
a$baz <- bb
class(a) <- "B"
a
}
bar.B <- function(object, ...) paste("B::bar ->", object$bar)
baz.B <- function(object, ...) paste("B::baz ->", object$baz)
# Corrected: the class vector c("B", "A") gives single inheritance —
# dispatch tries the B methods first, then falls back to A's.
B <- function(f, b, bb) {
a <- A(f, b)
a$baz <- bb
class(a) <- c("B", "A")
a
}
b <- B("Charles", "Kwame", "Appiah")
foo(b)
## [1] "A::foo -> Charles"
bar(b)
## [1] "B::bar -> Kwame"
baz(b)
## [1] "B::baz -> Appiah"
# Class "C" extends B; the full class hierarchy is spelled out by hand.
C <- function(f, b, bb) {
b <- B(f, b, bb)
class(b) <- c("C", "B", "A")
b
}
c <- C("foo", "bar", "baz")
foo(c)
## [1] "A::foo -> foo"
bar(c)
## [1] "B::bar -> bar"
baz(c)
## [1] "B::baz -> baz"
# More robust spelling: prepend "C" to whatever classes B() produced,
# so C need not hard-code B's class hierarchy.
C <- function(f, b, bb) {
b <- B(f, b, bb)
class(b) <- c("C", class(b))
b
}
# **Chapter 12**
## Testing R packages
# Generic interfaces for geometric shapes.
area <- function(x) UseMethod("area")
circumference <- function(x) UseMethod("circumference")

# Construct a rectangle; it is both a "rectangle" and, more generally,
# a "shape", so shape-level methods would also apply.
rectangle <- function(width, height) {
  structure(list(width = width, height = height),
            class = c("rectangle", "shape"))
}

# Rectangle-specific implementations of the shape generics.
area.rectangle <- function(x) x$width * x$height
circumference.rectangle <- function(x) 2 * (x$width + x$height)
r <- rectangle(width = 3, height = 4)
area(r)
## [1] 12
circumference(r)
## [1] 14
# Automating testing
r <- rectangle(width = 3, height = 4)
if (area(r) != 3*4) {
stop("Area not computed correctly!")
}
if (circumference(r) != 2*3 + 2*4) {
stop("Circumference not computed correctly!")
}
## **Chapter 14**
### Profiling and Optimizing
library(profvis)
# Build an undirected graph of n nodes as an n x n 0/1 adjacency
# matrix.  `edges` is a flat vector of endpoint pairs:
# (edges[1], edges[2]), (edges[3], edges[4]), ...
graph <- function(n, edges) {
  adjacency <- matrix(0, nrow = n, ncol = n)
  edge_count <- length(edges)
  if (edge_count >= 1) {
    # Walk the endpoint pairs two at a time and mark both directions.
    for (k in seq(1, edge_count, by = 2)) {
      a <- edges[k]
      b <- edges[k + 1]
      adjacency[a, b] <- 1
      adjacency[b, a] <- 1
    }
  }
  structure(adjacency, class = "graph")
}
# One smoothing pass over an adjacency-matrix graph: each node's new
# weight is a convex combination of its own weight (factor alpha) and
# the mean weight of its neighbours (factor 1 - alpha).
#
# graph:        square 0/1 adjacency matrix (matrix "graph" representation)
# node_weights: numeric vector, one weight per node
# alpha:        smoothing factor in [0, 1]
# Returns the numeric vector of updated weights.
#
# FIX: the loops used 1:no_nodes, which yields c(1, 0) for an empty
# graph and then indexes out of bounds; seq_len() handles zero nodes.
smooth_weights <- function(graph, node_weights, alpha) {
  if (length(node_weights) != nrow(graph))
    stop("Incorrect number of nodes")
  no_nodes <- length(node_weights)
  new_weights <- vector("numeric", no_nodes)
  for (i in seq_len(no_nodes)) {
    neighbour_weights <- 0
    n <- 0
    for (j in seq_len(no_nodes)) {
      if (i != j && graph[i, j] == 1) {
        neighbour_weights <- neighbour_weights + node_weights[j]
        n <- n + 1
      }
    }
    if (n > 0) {
      new_weights[i] <-
        alpha * node_weights[i] +
        (1 - alpha) * neighbour_weights / n
    } else {
      # Isolated node: its weight is unchanged.
      new_weights[i] <- node_weights[i]
    }
  }
  new_weights
}
# Profile one build + one smoothing pass on a random 1000-node graph
# (600 sampled endpoints = 300 edges) using the adjacency-matrix
# representation defined above.
profvis::profvis({
n <- 1000
nodes <- 1:n
edges <- sample(nodes, 600, replace = TRUE)
weights <- rnorm(n)
g <- graph(n, edges)
smooth_weights(g, weights, 0.8)
})
# Build an undirected graph as adjacency lists: element i of the result
# holds the (unique) neighbours of node i.  `edges` is a flat vector of
# endpoint pairs, as in the matrix version.
graph <- function(n, edges) {
  neighbours <- lapply(seq_len(n), function(unused) integer(0))
  edge_count <- length(edges)
  if (edge_count >= 1) {
    for (k in seq(1, edge_count, by = 2)) {
      from <- edges[k]
      to <- edges[k + 1]
      # Prepend each endpoint to the other's neighbour list.
      neighbours[[from]] <- c(to, neighbours[[from]])
      neighbours[[to]] <- c(from, neighbours[[to]])
    }
  }
  # Collapse duplicate entries introduced by repeated edges.
  neighbours <- lapply(neighbours, unique)
  structure(neighbours, class = "graph")
}
# One smoothing pass over an adjacency-list graph (same contract as the
# matrix version, but only actual neighbours are visited, so each pass
# is O(nodes + edges) instead of O(nodes^2)).
#
# graph:        list of neighbour vectors, one element per node
# node_weights: numeric vector, one weight per node
# alpha:        smoothing factor in [0, 1]
# Returns the numeric vector of updated weights.
#
# FIX: the outer loop used 1:no_nodes, which yields c(1, 0) for an
# empty graph and indexes out of bounds; seq_len() handles zero nodes.
smooth_weights <- function(graph, node_weights, alpha) {
  if (length(node_weights) != length(graph))
    stop("Incorrect number of nodes")
  no_nodes <- length(node_weights)
  new_weights <- vector("numeric", no_nodes)
  for (i in seq_len(no_nodes)) {
    neighbour_weights <- 0
    n <- 0
    for (j in graph[[i]]) {
      if (i != j) {  # skip self-loops
        neighbour_weights <- neighbour_weights + node_weights[j]
        n <- n + 1
      }
    }
    if (n > 0) {
      new_weights[i] <-
        alpha * node_weights[i] +
        (1 - alpha) * neighbour_weights / n
    } else {
      # No neighbours: weight is unchanged.
      new_weights[i] <- node_weights[i]
    }
  }
  new_weights
}
# Profile the adjacency-list implementation on a 10x larger graph
# (10,000 nodes, still 300 edges), comparing against the matrix run.
profvis::profvis({
n <- 10000
nodes <- 1:n
edges <- sample(nodes, 600, replace = TRUE)
weights <- rnorm(n)
g <- graph(n, edges)
smooth_weights(g, weights, 0.8)
})
# Adjacency-list graph constructor, edge-deduplicating version: edge
# pairs are normalized smallest-endpoint-first so that duplicated edges
# collapse under unique() before the neighbour lists are built.
#
# IMPROVEMENT: the original normalized edges with an interpreted
# row-by-row swap loop; pmin()/pmax() do the same normalization
# vectorized, which fits this chapter's optimization theme.  seq_len()
# replaces the 1:nrow() idiom.
graph <- function(n, edges) {
  neighbours <- vector("list", length = n)
  for (i in seq_along(neighbours)) {
    neighbours[[i]] <- vector("integer", length = 0)
  }
  no_edges <- length(edges)
  if (no_edges >= 1) {
    sources <- seq(1, no_edges, by = 2)
    destinations <- seq(2, no_edges, by = 2)
    # Orient every edge smallest-node-first, vectorized.
    edge_matrix <- cbind(pmin(edges[sources], edges[destinations]),
                         pmax(edges[sources], edges[destinations]))
    # Drop duplicated (normalized) edges.
    edge_matrix <- unique(edge_matrix)
    for (i in seq_len(nrow(edge_matrix))) {
      n1 <- edge_matrix[i, 1]
      n2 <- edge_matrix[i, 2]
      neighbours[[n1]] <- c(n2, neighbours[[n1]])
      neighbours[[n2]] <- c(n1, neighbours[[n2]])
    }
  }
  structure(neighbours, class = "graph")
}
# Profile the deduplicating constructor on a denser graph
# (20,000 nodes, 50,000 sampled endpoints = 25,000 edges).
profvis::profvis({
n <- 20000
nodes <- 1:n
edges <- sample(nodes, 50000, replace = TRUE)
weights <- rnorm(n)
g <- graph(n, edges)
smooth_weights(g, weights, 0.8)
})
# One smoothing pass over an adjacency-list graph; single-iteration
# helper for the multi-pass smooth_weights wrapper below.
#
# graph:        list of neighbour vectors, one element per node
# node_weights: numeric vector, one weight per node
# alpha:        smoothing factor in [0, 1]
# Returns the numeric vector of updated weights.
#
# BUG FIX: the result vector was allocated with vector("numeric", n),
# but `n` is not defined at that point in this function -- it silently
# read a global `n` (and errored when none existed).  Allocate with
# no_nodes instead; also use seq_len() for the zero-node case.
flow_weights_iteration <- function(graph, node_weights, alpha) {
  if (length(node_weights) != length(graph))
    stop("Incorrect number of nodes")
  no_nodes <- length(node_weights)
  new_weights <- vector("numeric", no_nodes)
  for (i in seq_len(no_nodes)) {
    neighbour_weights <- 0
    n <- 0
    for (j in graph[[i]]) {
      if (i != j) {
        neighbour_weights <- neighbour_weights + node_weights[j]
        n <- n + 1
      }
    }
    if (n > 0) {
      new_weights[i] <- (alpha * node_weights[i] + (1 - alpha)
                         * neighbour_weights / n)
    } else {
      new_weights[i] <- node_weights[i]
    }
  }
  new_weights
}
# Apply no_iterations smoothing passes to node_weights.
#
# BUG FIX: the original wrapped the update in
#   replicate(no_iterations, { new_weights <- ... })
# but replicate() evaluates its expression in a fresh frame each time,
# so the assignment never updated the outer new_weights and the
# function returned node_weights unchanged.  Fold the passes with
# Reduce() instead, threading the weights through each iteration.
smooth_weights <- function(graph, node_weights, alpha, no_iterations) {
  Reduce(
    function(weights, iteration) {
      flow_weights_iteration(graph, weights, alpha)
    },
    seq_len(no_iterations),
    node_weights
  )
}
# Profile ten smoothing iterations on a 20,000-node graph with
# 100,000 sampled endpoints (50,000 edges), using the 4-argument
# multi-pass smooth_weights defined just above.
profvis::profvis({
n <- 20000
nodes <- 1:n
edges <- sample(nodes, 100000, replace = TRUE)
weights <- rnorm(n)
g <- graph(n, edges)
smooth_weights(g, weights, 0.8, 10)
})
# Apply no_iterations smoothing passes by repeatedly calling the
# single-pass helper, threading the result through a plain loop.
#
# CONSISTENCY FIX: the single-pass helper defined in this file is
# flow_weights_iteration; the original called the undefined name
# smooth_weights_iteration, which would fail at runtime.  seq_len()
# also replaces 1:no_iterations so zero iterations does nothing.
smooth_weights <- function(graph, node_weights,
                           alpha, no_iterations) {
  new_weights <- node_weights
  for (i in seq_len(no_iterations)) {
    new_weights <-
      flow_weights_iteration(graph, new_weights, alpha)
  }
  new_weights
}
### Increasing Speed
library(microbenchmark)
# Hand-rolled sum, used below as a benchmark baseline against the
# compiled base sum().  Accumulates left-to-right from an initial 0,
# exactly like the explicit for-loop formulation.
mysum <- function(sequence) {
  Reduce(`+`, sequence, 0)
}
# Benchmark base sum() against the interpreted loop; the captured
# timings below show sum() roughly an order of magnitude faster.
microbenchmark(
sum(1:10),
mysum(1:10)
)
## Unit: nanoseconds
## expr min lq mean median uq max neval cld
## sum(1:10) 300 400 417 400 400 2200 100 a
## mysum(1:10) 1100 1100 38814 1200 1200 3763600 100 a
# Adding Reduce() to the comparison: the higher-order-function overhead
# makes it the slowest of the three here.
microbenchmark(
sum(1:10),
mysum(1:10),
Reduce(`+`, 1:10)
)
## Unit: nanoseconds
## expr min lq mean median uq max neval cld
## sum(1:10) 300 500 756 700 900 3400 100 a
## mysum(1:10) 1200 1500 2152 2200 2500 6100 100 a
## Reduce(`+`, 1:10) 8700 12400 14948 14200 15250 68000 100 b
x <- sample(LETTERS, 1000, replace = TRUE)
# Supplying levels up front spares factor() the work of deriving them
# from the data.
microbenchmark(
factor(x, levels = LETTERS),
factor(x)
)
## Unit: microseconds
## expr min lq mean median uq max neval cld
## factor(x, levels = LETTERS) 27.5 28.8 36.113 29.85 36.9 131.9 100 a
## factor(x) 114.9 120.9 136.633 128.35 143.5 269.8 100 b
x <- rnorm(1000)
names(x) <- paste("n", 1:1000)
# use.names = FALSE lets unlist() skip building the names attribute;
# the effect is small at this size (timings below overlap).
microbenchmark(
unlist(Map(function(x) x**2, x), use.names = FALSE),
unlist(Map(function(x) x**2, x))
)
## Unit: microseconds
## expr min lq mean
## unlist(Map(function(x) x^2, x), use.names = FALSE) 941.1 1068.4 1380.142
## unlist(Map(function(x) x^2, x)) 1012.4 1132.4 1358.209
## median uq max neval cld
## 1157.65 1508.65 7638.5 100 a
## 1230.05 1545.60 2174.0 100 a
# One smoothing pass over an adjacency-list graph, expressed as a map
# over node indices instead of an explicit outer loop.
#
# graph:        list of neighbour vectors, one element per node
# node_weights: numeric vector, one weight per node
# alpha:        smoothing factor in [0, 1]
# Returns the numeric vector of updated weights.
#
# FIX: 1:length(node_weights) yields c(1, 0) for empty input (and then
# indexes out of bounds); seq_along() is safe.  vapply() replaces
# unlist(Map(...)) -- it is type-stable and returns numeric(0) for
# zero nodes.
smooth_weights_iteration_map <- function(graph, node_weights, alpha) {
  if (length(node_weights) != length(graph))
    stop("Incorrect number of nodes")
  handle_i <- function(i) {
    neighbour_weights <- 0
    n <- 0
    for (j in graph[[i]]) {
      if (i != j) {
        neighbour_weights <- neighbour_weights + node_weights[j]
        n <- n + 1
      }
    }
    if (n > 0) {
      alpha * node_weights[i] + (1 - alpha) * neighbour_weights / n
    } else {
      node_weights[i]
    }
  }
  vapply(seq_along(node_weights), handle_i, numeric(1))
}
library(cluster)
# Fit dist ~ speed on the training part of a split and report the RMSE
# on the test part.  NOTE(review): this block was garbled in the source
# (markdown turned every `$` into `\(`/`\)` and collapsed it onto one
# line); reconstructed here.  It relies on %>%, partition(), rmse(),
# clustermq() and `cl` defined elsewhere.  `predict(model, data = .)`
# is kept verbatim, but predict.lm expects `newdata` -- confirm against
# the surrounding text.
test_rmse <- function(data) {
  model <- data$training %>% lm(dist ~ speed, data = .)
  predictions <- data$test %>% predict(model, data = .)
  rmse(data$test$dist, predictions)
}
# RMSE over n random 60/40 train/test splits of the cars data,
# evaluated sequentially.
sample_rmse <- function(n) {
  random_cars <- cars %>% partition(n, c(training = 0.6, test = 0.4))
  unlist(Map(test_rmse, random_cars))
}
# Same computation, but farming the splits out to the cluster `cl`.
sample_rmse_parallel <- function(n) {
  random_cars <- cars %>% partition(n, c(training = 0.6, test = 0.4))
  unlist(clustermq(cl, test_rmse, random_cars))
}
# Benchmark sequential vs parallel RMSE sampling at 10 and 1000 splits
# (5 runs each); parallelism only pays off once the work is large.
microbenchmark( sample_rmse(10), sample_rmse_parallel(10), times = 5 )
microbenchmark( sample_rmse(1000), sample_rmse_parallel(1000), times = 5 )
library(Rcpp)
# Compile a C++ single-pass smoother via Rcpp.  NOTE(review): the C++
# body below is a placeholder from the original text ("shown above...");
# as written it compiles but leaves new_weights zero-filled -- the real
# loop body must be restored from the accompanying text.
cppFunction("
NumericVector
smooth_weights_iteration_cpp(List g,
NumericVector node_weights,
double alpha)
{
NumericVector new_weights(g.length());
for (int i = 0; i < g.length(); ++i) {
// The body here is just the C++ code
// shown above...
}
return new_weights;
}
")
# Run no_iterations smoothing passes using the compiled C++ single-pass
# implementation, folding the weights through each pass.
smooth_weights_cpp <- function(graph, node_weights,
                               alpha, no_iterations) {
  Reduce(
    function(current, pass) {
      smooth_weights_iteration_cpp(graph, current, alpha)
    },
    1:no_iterations,
    node_weights
  )
}
library(microbenchmark)
# Compare the pure-R and Rcpp smoothing implementations (5 runs each).
# FIX: the original jammed both statements onto one line, which is a
# parse error in R; split into separate expressions.
microbenchmark(
  smooth_weights(g, weights, 0.8, 10),
  smooth_weights_cpp(g, weights, 0.8, 10),
  times = 5
)
library(rstudioapi)
# Open a file in the RStudio viewer pane.  NOTE(review): the "viewer"
# option is only set when running inside RStudio; elsewhere getOption()
# returns NULL and the call below would fail -- confirm this is meant
# to run within RStudio.
viewer <- getOption("viewer")
viewer("R-with Data Science.Rmd", height = "maximize")
## NULL
#rstudioapi::navigateToFile("")